TerrifiedBug · TerrifiedBug · Mar 7, 2026 · Mar 7, 2026 · Mar 7, 2026 · Mar 7, 2026
diff --git a/docs/public/user-guide/pipelines.md b/docs/public/user-guide/pipelines.md
@@ -12,6 +12,7 @@ Pipelines are displayed in a table with the following columns:
 |--------|------------|
 | **Name** | The pipeline name. Click it to open the pipeline in the editor. |
 | **Status** | Current lifecycle state (see statuses below). |
+| **Health** | SLI health badge -- green **Healthy**, yellow **Degraded**, gray **No Data**, or gray **No SLIs** (see [Pipeline Health SLIs](#pipeline-health-slis) below). |
 | **Events/sec In** | Live event ingestion rate polled from the agent fleet. |
 | **Bytes/sec In** | Live byte ingestion rate. |
 | **Reduction** | Percentage of events reduced by transforms, color-coded green (>50%), amber (>10%), or neutral. |
@@ -69,6 +70,57 @@ Every time you deploy a pipeline, a new **version** is created that captures the
 
 The pipeline list shows a **Pending deploy** badge when the saved configuration differs from the most recently deployed version, so you always know if there are undeployed changes.
 
+## Pipeline Health SLIs
+
+Service Level Indicators (SLIs) let you define health thresholds for your deployed pipelines. When SLIs are configured, VectorFlow continuously evaluates pipeline metrics against your thresholds and displays the result as a health badge in the pipeline list and pipeline editor toolbar.
+
+### Health badges
+
+| Badge | Meaning |
+|-------|---------|
+| **Healthy** (green) | All configured SLIs are within their thresholds. |
+| **Degraded** (yellow) | One or more SLIs have breached their threshold. |
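+| **No Data** (gray) | SLIs are configured, but no health determination could be made (for example, metric rows exist but report zero events ingested). |
+| **No SLIs** (gray) | No SLI definitions have been configured for this pipeline. |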
+
+Draft pipelines do not show a health badge since they are not deployed and have no metrics.
+
+### Available metrics
+
+| Metric | Description | Typical condition |
+|--------|-------------|-------------------|
+| **Error Rate** | Ratio of errors to total events ingested (`errorsTotal / eventsIn`). | `< 0.01` (less than 1% errors) |
+| **Discard Rate** | Ratio of discarded events to total events ingested (`eventsDiscarded / eventsIn`). | `< 0.05` (less than 5% discards) |
+| **Throughput Floor** | Events per second averaged over the evaluation window (`eventsIn / windowSeconds`). | `> 100` (more than 100 events/sec) |
+
+### Configuring SLIs
+
+{% stepper %}
+{% step %}
+### Open pipeline settings
+In the pipeline editor, click the **Settings** gear icon in the toolbar to open the settings popover.
+{% endstep %}
+{% step %}
+### Expand Health SLIs
+Click the **Health SLIs** collapsible section at the bottom of the settings panel.
+{% endstep %}
+{% step %}
+### Add an SLI
+Select a **Metric** (Error Rate, Throughput Floor, or Discard Rate), choose a **Condition** (less than or greater than), set a **Threshold** value, and configure the evaluation **Window** in minutes (1--1440). Click **Add SLI** to save.
+{% endstep %}
+{% step %}
+### Review and remove
+Existing SLIs are listed above the form. Click the trash icon to remove an SLI. Changes are saved immediately -- the pipeline list and toolbar health indicators update on the next evaluation cycle.
+{% endstep %}
+{% endstepper %}
+
+{% hint style="info" %}
+Each metric can only have one SLI per pipeline. Adding an SLI for a metric that already has one will update the existing definition.
+{% endhint %}
+
+{% hint style="warning" %}
+If no metric data is available for the evaluation window (for example, the pipeline was recently deployed or has no traffic), the SLI is treated as **breached** and the pipeline health will show as **Degraded**.
+{% endhint %}
-{% hint style="warning" %}
-If no metric data is available for the evaluation window (for example, the pipeline was recently deployed or has no traffic), the SLI is treated as **breached** and the pipeline health will show as **Degraded**.
-{% endhint %}
+{% hint style="warning" %}
+If no metric rows exist for the evaluation window (for example, the pipeline was recently deployed), the SLI is treated as **breached** and the pipeline health will show as **Degraded**.
+
+For rate-based SLIs (`error_rate`, `discard_rate`), if metric rows exist but report zero events ingested, the SLI result is **No Data** — no health determination is made. Pair these SLIs with a `throughput_floor` SLI to catch a stalled pipeline as **Degraded**.
+{% endhint %}
-{% hint style="warning" %}
-If no metric data is available for the evaluation window (for example, the pipeline was recently deployed or has no traffic), the SLI is treated as **breached** and the pipeline health will show as **Degraded**.
-{% endhint %}
+{% hint style="warning" %}
+If no metric rows exist for the evaluation window (for example, the pipeline was recently deployed), the SLI is treated as **breached** and the pipeline health will show as **Degraded**.
+
+For rate-based SLIs (`error_rate`, `discard_rate`), if metric rows exist but report zero events ingested, the SLI result is **No Data** — no health determination is made. Pair these SLIs with a `throughput_floor` SLI to catch a stalled pipeline as **Degraded**.
+{% endhint %}
+
 ## Filtering by environment
 
 Pipelines are scoped to the currently selected **environment** (shown in the sidebar). Switch environments to view pipelines in a different environment. Each environment maintains its own independent set of pipelines, agent nodes, and secrets.
diff --git a/prisma/migrations/20260308000000_add_pipeline_slis/migration.sql b/prisma/migrations/20260308000000_add_pipeline_slis/migration.sql
@@ -0,0 +1,22 @@
+-- CreateTable
+-- Stores SLI definitions attached to pipelines: a metric name, a comparison
+-- condition, a numeric threshold and an evaluation window in minutes
+-- (default 5). New rows are enabled by default.
+CREATE TABLE "PipelineSli" (
+    "id" TEXT NOT NULL,
+    "pipelineId" TEXT NOT NULL,
+    "metric" TEXT NOT NULL,
+    "condition" TEXT NOT NULL,
+    "threshold" DOUBLE PRECISION NOT NULL,
+    "windowMinutes" INTEGER NOT NULL DEFAULT 5,
+    "enabled" BOOLEAN NOT NULL DEFAULT true,
+    "createdAt" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
+
+    CONSTRAINT "PipelineSli_pkey" PRIMARY KEY ("id")
+);
+
+-- CreateIndex
+-- Enforces at most one SLI row per (pipeline, metric) pair.
+CREATE UNIQUE INDEX "PipelineSli_pipelineId_metric_key" ON "PipelineSli"("pipelineId", "metric");
+
+-- CreateIndex
+-- NOTE(review): the unique index above already has "pipelineId" as its leading
+-- column, so this single-column index may be redundant -- confirm before keeping.
+CREATE INDEX "PipelineSli_pipelineId_idx" ON "PipelineSli"("pipelineId");
+
+-- AddForeignKey
+-- Deleting a Pipeline row deletes its SLI rows (ON DELETE CASCADE).
+ALTER TABLE "PipelineSli" ADD CONSTRAINT "PipelineSli_pipelineId_fkey" FOREIGN KEY ("pipelineId") REFERENCES "Pipeline"("id") ON DELETE CASCADE ON UPDATE CASCADE;
diff --git a/prisma/schema.prisma b/prisma/schema.prisma
@@ -213,6 +213,7 @@ model Pipeline {
   alertRules      AlertRule[]
   sampleRequests  EventSampleRequest[]
   eventSamples    EventSample[]
+  slis            PipelineSli[]
   createdAt       DateTime             @default(now())
   updatedAt       DateTime             @updatedAt
 }
@@ -258,6 +259,21 @@ model PipelineMetric {
   @@index([timestamp])
 }
 
+// An SLI definition attached to a pipeline: a metric, a comparison condition,
+// a threshold and an evaluation window. At most one row exists per
+// (pipelineId, metric) pair, enforced by the @@unique below.
+model PipelineSli {
+  id            String   @id @default(cuid())
+  pipelineId    String
+  // SLI rows are removed together with their pipeline (onDelete: Cascade).
+  pipeline      Pipeline @relation(fields: [pipelineId], references: [id], onDelete: Cascade)
+  metric        String   // "error_rate" | "throughput_floor" | "discard_rate"
+  condition     String   // "lt" | "gt"
+  // Numeric threshold paired with `condition`.
+  threshold     Float
+  // Evaluation window in minutes; defaults to 5.
+  windowMinutes Int      @default(5)
+  // NOTE(review): presumably disabled SLIs are skipped during evaluation -- confirm against the evaluator.
+  enabled       Boolean  @default(true)
+  createdAt     DateTime @default(now())
+
+  @@unique([pipelineId, metric])
+  // NOTE(review): possibly redundant with the composite unique index above, whose leading column is pipelineId -- confirm.
+  @@index([pipelineId])
+}
+
 model EventSampleRequest {
   id            String        @id @default(cuid())
   pipelineId    String

diff --git a/src/app/(dashboard)/pipelines/page.tsx b/src/app/(dashboard)/pipelines/page.tsx
@@ -80,6 +80,51 @@ function reductionColor(pct: number): string {
   return "bg-muted text-muted-foreground";
 }
 
+/**
+ * Lazily fetches SLI health for a single pipeline and renders it as a badge.
+ *
+ * Issues its own `pipeline.health` query for `pipelineId` and re-polls it every
+ * 30 seconds.
+ *
+ * Rendered states:
+ * - a skeleton while the first fetch is in flight;
+ * - a neutral `--` placeholder when no health payload is available once loading
+ *   has finished (for example, the query failed), so a failed request is not
+ *   mislabelled as "No SLIs";
+ * - "Healthy" / "Degraded" for the corresponding statuses;
+ * - "No Data" when the status is `no_data` and at least one SLI is returned;
+ * - "No SLIs" otherwise.
+ *
+ * @param pipelineId - ID of the pipeline whose health should be displayed.
+ */
+function PipelineHealthBadge({ pipelineId }: { pipelineId: string }) {
+  const trpc = useTRPC();
+  const healthQuery = useQuery(
+    trpc.pipeline.health.queryOptions(
+      { pipelineId },
+      { refetchInterval: 30_000 },
+    ),
+  );
+
+  if (healthQuery.isLoading) {
+    return <Skeleton className="h-5 w-14" />;
+  }
+
+  const health = healthQuery.data;
+  if (!health) {
+    // Not loading and still no payload: the health state is unknown. Showing
+    // "No SLIs" here would wrongly claim that nothing is configured.
+    return (
+      <span className="text-sm text-muted-foreground" title="Health status unavailable">
+        --
+      </span>
+    );
+  }
+
+  const { status } = health;
+  const hasSlis = health.slis.length > 0;
+
+  if (status === "healthy") {
+    return (
+      <Badge variant="outline" className="bg-green-500/15 text-green-700 dark:text-green-400 border-green-500/30">
+        Healthy
+      </Badge>
+    );
+  }
+  if (status === "degraded") {
+    return (
+      <Badge variant="outline" className="bg-yellow-500/15 text-yellow-700 dark:text-yellow-400 border-yellow-500/30">
+        Degraded
+      </Badge>
+    );
+  }
+  if (status === "no_data" && hasSlis) {
+    return (
+      <Badge variant="outline" className="text-muted-foreground">
+        No Data
+      </Badge>
+    );
+  }
+  return (
+    <Badge variant="outline" className="text-muted-foreground">
+      No SLIs
+    </Badge>
+  );
+}
+
 export default function PipelinesPage() {
   const trpc = useTRPC();
   const selectedEnvironmentId = useEnvironmentStore((s) => s.selectedEnvironmentId);
@@ -177,6 +222,7 @@ export default function PipelinesPage() {
             <TableRow>
               <TableHead>Name</TableHead>
               <TableHead>Status</TableHead>
+              <TableHead>Health</TableHead>
               <TableHead className="text-right">Events/sec In</TableHead>
               <TableHead className="text-right">Bytes/sec In</TableHead>
               <TableHead className="text-right">Reduction</TableHead>
@@ -218,6 +264,14 @@ export default function PipelinesPage() {
                   )}
                   </div>
                 </TableCell>
+                {/* Health */}
+                <TableCell>
+                  {pipeline.isDraft ? (
+                    <span className="text-sm text-muted-foreground">--</span>
+                  ) : (
+                    <PipelineHealthBadge pipelineId={pipeline.id} />
+                  )}
+                </TableCell>
                 {/* Events/sec In */}
                 <TableCell className="text-right font-mono text-sm text-muted-foreground">
                   {liveRates[pipeline.id]

diff --git a/src/components/flow/flow-toolbar.tsx b/src/components/flow/flow-toolbar.tsx
@@ -44,7 +44,7 @@ import { cn } from "@/lib/utils";
 import { useFlowStore } from "@/stores/flow-store";
 import { generateVectorYaml, generateVectorToml, importVectorConfig } from "@/lib/config-generator";
 import { useTRPC } from "@/trpc/client";
-import { useMutation } from "@tanstack/react-query";
+import { useMutation, useQuery } from "@tanstack/react-query";
 import { VersionHistoryDialog } from "@/components/pipeline/version-history-dialog";
 
 type ProcessStatusValue = "RUNNING" | "STARTING" | "STOPPED" | "CRASHED" | "PENDING";
@@ -113,6 +113,15 @@ export function FlowToolbar({
   const [versionsOpen, setVersionsOpen] = useState(false);
 
   const trpc = useTRPC();
+
+  const healthQuery = useQuery(
+    trpc.pipeline.health.queryOptions(
+      { pipelineId: pipelineId! },
+      { enabled: !!pipelineId && !isDraft && !!deployedAt, refetchInterval: 30_000 },
+    ),
+  );
+  const healthStatus = healthQuery.data?.status ?? null;
+
   const validateMutation = useMutation(trpc.validator.validate.mutationOptions({
     onSuccess: (result) => {
       if (result.valid) {
@@ -359,7 +368,7 @@ export function FlowToolbar({
             <TooltipContent>Pipeline settings</TooltipContent>
           </Tooltip>
           <PopoverContent align="end" className="w-80">
-            <PipelineSettings />
+            <PipelineSettings pipelineId={pipelineId} />
           </PopoverContent>
         </Popover>
 
@@ -386,6 +395,23 @@ export function FlowToolbar({
               {processStatus === "CRASHED" && "Crashed"}
               {processStatus === "PENDING" && "Pending..."}
             </span>
+            {/* Health SLI indicator dot */}
+            {healthStatus === "healthy" && (
+              <Tooltip>
+                <TooltipTrigger asChild>
+                  <span className="h-2 w-2 rounded-full bg-green-500" />
+                </TooltipTrigger>
+                <TooltipContent>All SLIs met</TooltipContent>
+              </Tooltip>
+            )}
+            {healthStatus === "degraded" && (
+              <Tooltip>
+                <TooltipTrigger asChild>
+                  <span className="h-2 w-2 rounded-full bg-yellow-500" />
+                </TooltipTrigger>
+                <TooltipContent>One or more SLIs breached</TooltipContent>
+              </Tooltip>
+            )}
           </div>
         )}