From 672da1be94f3b6468bd8b2af51fb15e69ff25ac2 Mon Sep 17 00:00:00 2001
From: Trevor Whitney
Date: Fri, 24 Oct 2025 13:47:15 -0600
Subject: [PATCH 01/12] refactor: implement parse as a projection

commit 4e5f95f61aae83331c7b27b1b7b8aa274bc6cde3
Author: Trevor Whitney
Date: Thu Oct 23 13:47:59 2025 -0600

    test: fix planner tests

commit dfbdcb7526971ae5006ce8f5a08730adfda2a179
Merge: c9971120d 68df3efd0
Author: Trevor Whitney
Date: Thu Oct 23 13:26:39 2025 -0600

    Merge branch 'main' into twhitney/refactor-parse

commit c9971120d6c5e3c82874c5b58e4eea66914e10a3
Author: Trevor Whitney
Date: Thu Oct 23 13:24:03 2025 -0600

    chore: fix linting errors

commit 037e337654b1becee13f032fcf4f04c27d7441b2
Author: Trevor Whitney
Date: Thu Oct 23 12:54:27 2025 -0600

    test: fix field names in expression test

commit ad6b10175f5c412a54a68dd17a2a14dd25d6b7ed
Merge: 79f2cea22 d4c53e9bc
Author: Trevor Whitney
Date: Thu Oct 23 12:46:34 2025 -0600

    Merge branch 'main' into twhitney/refactor-parse

commit 79f2cea22dec24e4a391cf848e603fa7e1770491
Author: Trevor Whitney
Date: Thu Oct 23 12:44:12 2025 -0600

    test: fix workflow planner test

commit 40db5efd221155404a8aba25e9999c7c5a5acc98
Author: Trevor Whitney
Date: Thu Oct 23 11:38:25 2025 -0600

    chore: clean up a few comments

commit ad91fda15c4a093be0729b1227ff72d33e45499d
Author: Trevor Whitney
Date: Thu Oct 23 11:23:19 2025 -0600

    refactor: implement parse as a projection
---
 pkg/engine/internal/executor/executor.go      |  14 -
 pkg/engine/internal/executor/executor_test.go |  24 -
 pkg/engine/internal/executor/expressions.go   |  28 +
 .../internal/executor/expressions_test.go     | 763 ++++++++++++++++--
 pkg/engine/internal/executor/functions.go     |  43 +
 pkg/engine/internal/executor/parse.go         | 105 +--
 pkg/engine/internal/executor/parse_test.go    | 730 -----------------
 .../internal/planner/logical/builder.go       |  39 +-
 .../planner/logical/builder_convert.go        |  17 -
 .../internal/planner/logical/function_op.go   |  42 +
 .../internal/planner/logical/node_parse.go    |  54 --
 .../internal/planner/logical/planner.go       |   6 +-
 .../internal/planner/logical/planner_test.go  |  26 +-
 .../internal/planner/physical/expressions.go  |  86 +-
 .../internal/planner/physical/optimizer.go    |  65 +-
 .../planner/physical/optimizer_test.go        | 234 ++++--
 pkg/engine/internal/planner/physical/parse.go |  55 --
 pkg/engine/internal/planner/physical/plan.go  |   2 -
 .../internal/planner/physical/planner.go      |  54 +-
 .../internal/planner/physical/planner_test.go | 100 ++-
 .../internal/planner/physical/printer.go      |   7 -
 pkg/engine/internal/planner/planner_test.go   |  29 +
 pkg/engine/internal/semconv/identifier.go     |   7 +
 pkg/engine/internal/types/column.go           |   5 +
 pkg/engine/internal/types/literal.go          |  30 +
 pkg/engine/internal/types/operators.go        |  24 +
 pkg/engine/internal/types/types.go            |   9 +
 .../workflow/workflow_planner_test.go         |  29 +-
 28 files changed, 1407 insertions(+), 1220 deletions(-)
 delete mode 100644 pkg/engine/internal/executor/parse_test.go
 create mode 100644 pkg/engine/internal/planner/logical/function_op.go
 delete mode 100644 pkg/engine/internal/planner/logical/node_parse.go
 delete mode 100644 pkg/engine/internal/planner/physical/parse.go

diff --git a/pkg/engine/internal/executor/executor.go b/pkg/engine/internal/executor/executor.go
index 1a8de15c2679a..2964b2a93ff7f 100644
--- a/pkg/engine/internal/executor/executor.go
+++ b/pkg/engine/internal/executor/executor.go
@@ -101,8 +101,6 @@ func (c *Context) execute(ctx context.Context, node physical.Node) Pipeline {
 		return tracePipeline("physical.RangeAggregation", c.executeRangeAggregation(ctx, n, inputs))
 	case *physical.VectorAggregation:
 		return tracePipeline("physical.VectorAggregation", c.executeVectorAggregation(ctx, n, inputs))
-	case *physical.ParseNode:
-		return tracePipeline("physical.ParseNode", c.executeParse(ctx, n, inputs))
 	case *physical.ColumnCompat:
 		return tracePipeline("physical.ColumnCompat", c.executeColumnCompat(ctx, n, inputs))
 	case *physical.Parallelize:
@@ -367,18 +365,6 @@ func (c *Context) executeVectorAggregation(ctx context.Context, plan *physical.V
 	return pipeline
 }
 
-func (c *Context) executeParse(ctx context.Context, parse *physical.ParseNode, inputs []Pipeline) Pipeline {
-	if len(inputs) == 0 {
-		return emptyPipeline()
-	}
-
-	if len(inputs) > 1 {
-		return errorPipeline(ctx, fmt.Errorf("parse expects exactly one input, got %d", len(inputs)))
-	}
-
-	return NewParsePipeline(parse, inputs[0])
-}
-
 func (c *Context) executeColumnCompat(ctx context.Context, compat *physical.ColumnCompat, inputs []Pipeline) Pipeline {
 	if len(inputs) == 0 {
 		return emptyPipeline()
diff --git a/pkg/engine/internal/executor/executor_test.go b/pkg/engine/internal/executor/executor_test.go
index e7f13dc79900b..87e18b75ae811 100644
--- a/pkg/engine/internal/executor/executor_test.go
+++ b/pkg/engine/internal/executor/executor_test.go
@@ -87,27 +87,3 @@ func TestExecutor_Projection(t *testing.T) {
 		require.ErrorContains(t, err, "projection expects exactly one input, got 2")
 	})
 }
-
-func TestExecutor_Parse(t *testing.T) {
-	t.Run("no inputs result in empty pipeline", func(t *testing.T) {
-		ctx := t.Context()
-		c := &Context{}
-		pipeline := c.executeParse(ctx, &physical.ParseNode{
-			Kind:          physical.ParserLogfmt,
-			RequestedKeys: []string{"level", "status"},
-		}, nil)
-		_, err := pipeline.Read(ctx)
-		require.ErrorContains(t, err, EOF.Error())
-	})
-
-	t.Run("multiple inputs result in error", func(t *testing.T) {
-		ctx := t.Context()
-		c := &Context{}
-		pipeline := c.executeParse(ctx, &physical.ParseNode{
-			Kind:          physical.ParserLogfmt,
-			RequestedKeys: []string{"level"},
-		}, []Pipeline{emptyPipeline(), emptyPipeline()})
-		_, err := pipeline.Read(ctx)
-		require.ErrorContains(t, err, "parse expects exactly one input, got 2")
-	})
-}
diff --git a/pkg/engine/internal/executor/expressions.go b/pkg/engine/internal/executor/expressions.go
index afe733b5117c4..bd303750ca0be 100644
--- a/pkg/engine/internal/executor/expressions.go
+++ b/pkg/engine/internal/executor/expressions.go
@@ -26,6 +26,18 @@ func (e expressionEvaluator) eval(expr physical.Expression, input arrow.Record)
 	case *physical.ColumnExpr:
 		colIdent := semconv.NewIdentifier(expr.Ref.Column, expr.Ref.Type, types.Loki.String)
 
+		// For non-ambiguous columns, we can look up the column in the schema by its fully qualified name.
+		if expr.Ref.Type != types.ColumnTypeAmbiguous {
+			for idx, field := range input.Schema().Fields() {
+				ident, err := semconv.ParseFQN(field.Name)
+				if err != nil {
+					return nil, fmt.Errorf("failed to parse column %s: %w", field.Name, err)
+				}
+				if ident.ShortName() == colIdent.ShortName() && ident.ColumnType() == colIdent.ColumnType() {
+					return input.Column(idx), nil
+				}
+			}
+		}
@@ -124,6 +136,22 @@ func (e expressionEvaluator) eval(expr physical.Expression, input arrow.Record)
 		_, lhsIsScalar := expr.Left.(*physical.LiteralExpr)
 		_, rhsIsScalar := expr.Right.(*physical.LiteralExpr)
 		return fn.Evaluate(lhs, rhs, lhsIsScalar, rhsIsScalar)
+
+	case *physical.FunctionExpr:
+		args := make([]arrow.Array, len(expr.Expressions))
+		for i, arg := range expr.Expressions {
+			p, err := e.eval(arg, input)
+			if err != nil {
+				return nil, err
+			}
+			args[i] = p
+		}
+
+		fn, err := functions.GetForSignature(expr.Op)
+		if err != nil {
+			return nil, fmt.Errorf("failed to look up function: %w", err)
+		}
+		return fn.Evaluate(args...)
 	}
 
 	return nil, fmt.Errorf("unknown expression: %v", expr)
diff --git a/pkg/engine/internal/executor/expressions_test.go b/pkg/engine/internal/executor/expressions_test.go
index 0aa084261afa9..7758cb4537346 100644
--- a/pkg/engine/internal/executor/expressions_test.go
+++ b/pkg/engine/internal/executor/expressions_test.go
@@ -84,6 +84,11 @@ func TestEvaluateLiteralExpression(t *testing.T) {
 			want:      int64(1024),
 			arrowType: arrow.INT64,
 		},
+		{
+			name:      "list",
+			value:     []string{"a", "b", "c"},
+			arrowType: arrow.LIST,
+		},
 	} {
 		t.Run(tt.name, func(t *testing.T) {
 			literal := physical.NewLiteral(tt.value)
@@ -122,6 +127,30 @@
 	}
 }
 
+func TestEvaluateNamedLiteralListExpression(t *testing.T) {
+	colMsg := "utf8.builtin.message"
+	lit := &physical.NamedLiteralExpr{
+		Name:    "test",
+		Literal: types.NewLiteral([]string{"a", "b", "c"}),
+	}
+	e := newExpressionEvaluator()
+	schema := arrow.NewSchema([]arrow.Field{
+		semconv.FieldFromIdent(semconv.ColumnIdentMessage, false),
+	}, nil)
+	input := arrowtest.Rows{
+		{colMsg: `some log`},
+		{colMsg: `some other log`},
+	}
+	record := input.Record(memory.DefaultAllocator, schema)
+	result, err := e.eval(lit, record)
+	require.NoError(t, err)
+
+	require.Equal(t, int64(2), result.Len())
+	require.Equal(t, []string{"a", "b", "c"}, result.Value(0))
+	require.Equal(t, []string{"a", "b", "c"}, result.Value(1))
+}
+
 func TestEvaluateColumnExpression(t *testing.T) {
 	e := newExpressionEvaluator()
 
@@ -545,51 +574,695 @@ func TestEvaluateUnaryCastExpression(t *testing.T) {
 
 		require.Equal(t, 3, arr.NumField()) //value, error, errorDetails
 
-		// Find the value, error, and errorDetails fields using schema
-		var valueField *array.Float64
-		var errorField, errorDetailsField *array.String
-		valueField, ok = arr.Field(0).(*array.Float64)
-		require.True(t, ok)
-		errorField, ok = arr.Field(1).(*array.String)
-		require.True(t, ok)
-		errorDetailsField, ok = arr.Field(2).(*array.String)
-		require.True(t, ok)
+		// Convert struct to rows for comparison
+		actual, err := structToRows(arr)
+		require.NoError(t, err)
 
-		require.NotNil(t, valueField, "expected to find a Float64 value field in struct")
-		require.NotNil(t, errorField, "expected to find error field in struct")
-		require.NotNil(t, errorDetailsField, "expected to find errorDetails field in struct")
-
-		// Verify values: first row is valid (42.5), rest are invalid (0.0 with errors)
-		require.Equal(t, 42.5, valueField.Value(0))
-		require.False(t, valueField.IsNull(0))
-		require.True(t, errorField.IsNull(0))
-
-		// Row 1: invalid - "not_a_number"
-		require.Equal(t, 0.0, valueField.Value(1))
-		require.False(t, valueField.IsNull(1)) // Verify value is non-null 0.0
-		require.False(t, errorField.IsNull(1))
-
require.Equal(t, types.SampleExtractionErrorType, errorField.Value(1)) - require.Contains(t, errorDetailsField.Value(1), `strconv.ParseFloat: parsing "not_a_number": invalid syntax`) - - // Row 2: invalid - "1KB" - require.Equal(t, 0.0, valueField.Value(2)) - require.False(t, valueField.IsNull(2)) // Verify value is non-null 0.0 - require.False(t, errorField.IsNull(2)) - require.Equal(t, types.SampleExtractionErrorType, errorField.Value(2)) - require.Contains(t, errorDetailsField.Value(2), `strconv.ParseFloat: parsing "1KB": invalid syntax`) - - // Row 3: invalid - "invalid_bytes" - require.Equal(t, 0.0, valueField.Value(3)) - require.False(t, valueField.IsNull(3)) // Verify value is non-null 0.0 - require.False(t, errorField.IsNull(3)) - require.Equal(t, types.SampleExtractionErrorType, errorField.Value(3)) - require.Contains(t, errorDetailsField.Value(3), `strconv.ParseFloat: parsing "invalid_bytes": invalid syntax`) - - // Row 4: invalid - empty string - require.Equal(t, 0.0, valueField.Value(4)) - require.False(t, valueField.IsNull(4)) // Verify value is non-null 0.0 - require.False(t, errorField.IsNull(4)) - require.Equal(t, types.SampleExtractionErrorType, errorField.Value(4)) - require.Contains(t, errorDetailsField.Value(4), `strconv.ParseFloat: parsing "": invalid syntax`) + // Define expected output + expected := arrowtest.Rows{ + { + semconv.ColumnIdentValue.FQN(): 42.5, + semconv.ColumnIdentError.FQN(): nil, + semconv.ColumnIdentErrorDetails.FQN(): nil, + }, + { + semconv.ColumnIdentValue.FQN(): 0.0, + semconv.ColumnIdentError.FQN(): types.SampleExtractionErrorType, + semconv.ColumnIdentErrorDetails.FQN(): `strconv.ParseFloat: parsing "not_a_number": invalid syntax`, + }, + { + semconv.ColumnIdentValue.FQN(): 0.0, + semconv.ColumnIdentError.FQN(): types.SampleExtractionErrorType, + semconv.ColumnIdentErrorDetails.FQN(): `strconv.ParseFloat: parsing "1KB": invalid syntax`, + }, + { + semconv.ColumnIdentValue.FQN(): 0.0, + semconv.ColumnIdentError.FQN(): types.SampleExtractionErrorType, + semconv.ColumnIdentErrorDetails.FQN(): `strconv.ParseFloat: parsing "invalid_bytes": invalid syntax`, + }, + { + semconv.ColumnIdentValue.FQN(): 0.0, + semconv.ColumnIdentError.FQN(): types.SampleExtractionErrorType, + semconv.ColumnIdentErrorDetails.FQN(): `strconv.ParseFloat: parsing "": invalid syntax`, + }, + } + + require.Equal(t, expected, actual) }) } + +func TestEvaluateParseExpression_Logfmt(t *testing.T) { + var ( + colMsg = "utf8.builtin.message" + ) + + for _, tt := range []struct { + name string + schema *arrow.Schema + input arrowtest.Rows + requestedKeys []string + expectedFields int + expectedOutput arrowtest.Rows + }{ + { + name: "parse stage transforms records, adding columns parsed from message", + schema: arrow.NewSchema([]arrow.Field{ + semconv.FieldFromFQN("utf8.builtin.message", true), + }, nil), + input: arrowtest.Rows{ + {colMsg: "level=error status=500"}, + {colMsg: "level=info status=200"}, + {colMsg: "level=debug status=201"}, + }, + requestedKeys: []string{"level", "status"}, + expectedFields: 2, // 2 columns: level, status + expectedOutput: arrowtest.Rows{ + { + "utf8.parsed.level": "error", + "utf8.parsed.status": "500", + }, + { + "utf8.parsed.level": "info", + "utf8.parsed.status": "200", + }, + { + "utf8.parsed.level": "debug", + "utf8.parsed.status": "201", + }, + }, + }, + { + name: "handle missing keys with NULL", + schema: arrow.NewSchema([]arrow.Field{ + semconv.FieldFromFQN("utf8.builtin.message", true), + }, nil), + input: arrowtest.Rows{ + {colMsg: 
"level=error"}, + {colMsg: "status=200"}, + {colMsg: "level=info"}, + }, + requestedKeys: []string{"level"}, + expectedFields: 1, // 1 column: level + expectedOutput: arrowtest.Rows{ + { + "utf8.parsed.level": "error", + }, + { + "utf8.parsed.level": nil, + }, + { + "utf8.parsed.level": "info", + }, + }, + }, + { + name: "handle errors with error columns", + schema: arrow.NewSchema([]arrow.Field{ + semconv.FieldFromFQN("utf8.builtin.message", true), + }, nil), + input: arrowtest.Rows{ + {colMsg: "level=info status=200"}, // No errors + {colMsg: "status==value level=error"}, // Double equals error on requested key + {colMsg: "level=\"unclosed status=500"}, // Unclosed quote error + }, + requestedKeys: []string{"level", "status"}, + expectedFields: 4, // 4 columns: level, status, __error__, __error_details__ + expectedOutput: arrowtest.Rows{ + { + "utf8.parsed.level": "info", + "utf8.parsed.status": "200", + semconv.ColumnIdentError.FQN(): nil, + semconv.ColumnIdentErrorDetails.FQN(): nil, + }, + { + "utf8.parsed.level": nil, + "utf8.parsed.status": nil, + semconv.ColumnIdentError.FQN(): types.LogfmtParserErrorType, + semconv.ColumnIdentErrorDetails.FQN(): "logfmt syntax error at pos 8 : unexpected '='", + }, + { + "utf8.parsed.level": nil, + "utf8.parsed.status": nil, + semconv.ColumnIdentError.FQN(): types.LogfmtParserErrorType, + semconv.ColumnIdentErrorDetails.FQN(): "logfmt syntax error at pos 27 : unterminated quoted value", + }, + }, + }, + { + name: "extract all keys when none requested", + schema: arrow.NewSchema([]arrow.Field{ + semconv.FieldFromFQN("utf8.builtin.message", true), + }, nil), + input: arrowtest.Rows{ + {colMsg: "level=info status=200 method=GET"}, + {colMsg: "level=warn code=304"}, + {colMsg: "level=error status=500 method=POST duration=123ms"}, + }, + requestedKeys: nil, // nil means extract all keys + expectedFields: 5, // 5 columns: code, duration, level, method, status + expectedOutput: arrowtest.Rows{ + { + "utf8.parsed.code": nil, + "utf8.parsed.duration": nil, + "utf8.parsed.level": "info", + "utf8.parsed.method": "GET", + "utf8.parsed.status": "200", + }, + { + "utf8.parsed.code": "304", + "utf8.parsed.duration": nil, + "utf8.parsed.level": "warn", + "utf8.parsed.method": nil, + "utf8.parsed.status": nil, + }, + { + "utf8.parsed.code": nil, + "utf8.parsed.duration": "123ms", + "utf8.parsed.level": "error", + "utf8.parsed.method": "POST", + "utf8.parsed.status": "500", + }, + }, + }, + { + name: "extract all keys with errors when none requested", + schema: arrow.NewSchema([]arrow.Field{ + semconv.FieldFromFQN("utf8.builtin.message", true), + }, nil), + input: arrowtest.Rows{ + {colMsg: "level=info status=200 method=GET"}, // Valid line + {colMsg: "level==error code=500"}, // Double equals error + {colMsg: "msg=\"unclosed duration=100ms code=400"}, // Unclosed quote error + {colMsg: "level=debug method=POST"}, // Valid line + }, + requestedKeys: nil, // nil means extract all keys + expectedFields: 5, // 5 columns: level, method, status, __error__, __error_details__ + expectedOutput: arrowtest.Rows{ + { + "utf8.parsed.level": "info", + "utf8.parsed.method": "GET", + "utf8.parsed.status": "200", + semconv.ColumnIdentError.FQN(): nil, + semconv.ColumnIdentErrorDetails.FQN(): nil, + }, + { + "utf8.parsed.level": nil, + "utf8.parsed.method": nil, + "utf8.parsed.status": nil, + semconv.ColumnIdentError.FQN(): types.LogfmtParserErrorType, + semconv.ColumnIdentErrorDetails.FQN(): "logfmt syntax error at pos 7 : unexpected '='", + }, + { + "utf8.parsed.level": nil, + 
"utf8.parsed.method": nil, + "utf8.parsed.status": nil, + semconv.ColumnIdentError.FQN(): types.LogfmtParserErrorType, + semconv.ColumnIdentErrorDetails.FQN(): "logfmt syntax error at pos 38 : unterminated quoted value", + }, + { + "utf8.parsed.level": "debug", + "utf8.parsed.method": "POST", + "utf8.parsed.status": nil, + semconv.ColumnIdentError.FQN(): nil, + semconv.ColumnIdentErrorDetails.FQN(): nil, + }, + }, + }, + } { + t.Run(tt.name, func(t *testing.T) { + expr := &physical.FunctionExpr{ + Op: types.FunctionOpParseLogfmt, + Expressions: []physical.Expression{ + &physical.ColumnExpr{ + Ref: semconv.ColumnIdentMessage.ColumnRef(), + }, + &physical.NamedLiteralExpr{ + Name: types.ParseRequestedKeys, + Literal: types.NewLiteral(tt.requestedKeys), + }, + }, + } + e := newExpressionEvaluator() + + record := tt.input.Record(memory.DefaultAllocator, tt.schema) + colVec, err := e.eval(expr, record) + require.NoError(t, err) + id := colVec.Type().ArrowType().ID() + require.Equal(t, arrow.STRUCT, id) + + arr, ok := colVec.ToArray().(*array.Struct) + require.True(t, ok) + defer arr.Release() + + require.Equal(t, tt.expectedFields, arr.NumField()) //value, error, errorDetails + + // Convert record to rows for comparison + actual, err := structToRows(arr) + require.NoError(t, err) + require.Equal(t, tt.expectedOutput, actual) + }) + } +} + +func structToRows(structArr *array.Struct) (arrowtest.Rows, error) { + // Get the struct type to extract schema information + structType := structArr.DataType().(*arrow.StructType) + + // Create schema from struct fields + schema := arrow.NewSchema(structType.Fields(), nil) + + // Extract field arrays from the struct + columns := make([]arrow.Array, structArr.NumField()) + for i := 0; i < structArr.NumField(); i++ { + columns[i] = structArr.Field(i) + } + + // Create and return the record + record := array.NewRecord(schema, columns, int64(structArr.Len())) + defer record.Release() + return arrowtest.RecordRows(record) +} + +func TestEvaluateParseExpression_JSON(t *testing.T) { + var ( + colTS = "timestamp_ns.builtin.timestamp" + colMsg = "utf8.builtin.message" + ) + + for _, tt := range []struct { + name string + schema *arrow.Schema + input arrowtest.Rows + requestedKeys []string + expectedFields int + expectedOutput arrowtest.Rows + }{ + { + name: "parse stage transforms records, adding columns parsed from message", + schema: arrow.NewSchema([]arrow.Field{ + semconv.FieldFromFQN("utf8.builtin.message", true), + }, nil), + input: arrowtest.Rows{ + {colMsg: `{"level": "error", "status": "500"}`}, + {colMsg: `{"level": "info", "status": "200"}`}, + {colMsg: `{"level": "debug", "status": "201"}`}, + }, + requestedKeys: []string{"level", "status"}, + expectedFields: 2, // 2 columns: level, status + expectedOutput: arrowtest.Rows{ + {"utf8.parsed.level": "error", "utf8.parsed.status": "500"}, + {"utf8.parsed.level": "info", "utf8.parsed.status": "200"}, + {"utf8.parsed.level": "debug", "utf8.parsed.status": "201"}, + }, + }, + { + name: "parse stage preserves existing columns", + schema: arrow.NewSchema([]arrow.Field{ + semconv.FieldFromFQN("timestamp_ns.builtin.timestamp", true), + semconv.FieldFromFQN("utf8.builtin.message", true), + semconv.FieldFromFQN("utf8.label.app", true), + }, nil), + input: arrowtest.Rows{ + {colTS: time.Unix(1, 0).UTC(), colMsg: `{"level": "error", "status": "500"}`, "utf8.label.app": "frontend"}, + {colTS: time.Unix(2, 0).UTC(), colMsg: `{"level": "info", "status": "200"}`, "utf8.label.app": "backend"}, + }, + requestedKeys: 
[]string{"level", "status"}, + expectedFields: 2, // level, status + expectedOutput: arrowtest.Rows{ + { + "utf8.parsed.level": "error", + "utf8.parsed.status": "500", + }, + { + "utf8.parsed.level": "info", + "utf8.parsed.status": "200", + }, + }, + }, + { + name: "handle missing keys with NULL", + schema: arrow.NewSchema([]arrow.Field{ + semconv.FieldFromFQN("utf8.builtin.message", true), + }, nil), + input: arrowtest.Rows{ + {colMsg: `{"level": "error"}`}, + {colMsg: `{"status": "200"}`}, + {colMsg: `{"level": "info"}`}, + }, + requestedKeys: []string{"level"}, + expectedFields: 1, // 1 column: level + expectedOutput: arrowtest.Rows{ + {"utf8.parsed.level": "error"}, + {"utf8.parsed.level": nil}, + {"utf8.parsed.level": "info"}, + }, + }, + { + name: "handle errors with error columns", + schema: arrow.NewSchema([]arrow.Field{ + semconv.FieldFromFQN("utf8.builtin.message", true), + }, nil), + input: arrowtest.Rows{ + {colMsg: `{"level": "info", "status": "200"}`}, // No errors + {colMsg: `{"level": "error", "status":`}, // Missing closing brace and value + {colMsg: `{"level": "info", "status": 200}`}, // Number should be converted to string + }, + requestedKeys: []string{"level", "status"}, + expectedFields: 4, // 4 columns: level, status, __error__, __error_details__ (due to malformed JSON) + expectedOutput: arrowtest.Rows{ + { + "utf8.parsed.level": "info", + "utf8.parsed.status": "200", + semconv.ColumnIdentError.FQN(): nil, + semconv.ColumnIdentErrorDetails.FQN(): nil, + }, + { + "utf8.parsed.level": nil, + "utf8.parsed.status": nil, + semconv.ColumnIdentError.FQN(): "JSONParserErr", + semconv.ColumnIdentErrorDetails.FQN(): "Malformed JSON error", + }, + { + "utf8.parsed.level": "info", + "utf8.parsed.status": "200", + semconv.ColumnIdentError.FQN(): nil, + semconv.ColumnIdentErrorDetails.FQN(): nil, + }, + }, + }, + { + name: "extract all keys when none requested", + schema: arrow.NewSchema([]arrow.Field{ + semconv.FieldFromFQN("utf8.builtin.message", true), + }, nil), + input: arrowtest.Rows{ + {colMsg: `{"level": "info", "status": "200", "method": "GET"}`}, + {colMsg: `{"level": "warn", "code": "304"}`}, + {colMsg: `{"level": "error", "status": "500", "method": "POST", "duration": "123ms"}`}, + }, + requestedKeys: nil, // nil means extract all keys + expectedFields: 5, // 5 columns: message, code, duration, level, method, status + expectedOutput: arrowtest.Rows{ + { + "utf8.parsed.code": nil, + "utf8.parsed.duration": nil, + "utf8.parsed.level": "info", + "utf8.parsed.method": "GET", + "utf8.parsed.status": "200", + }, + { + "utf8.parsed.code": "304", + "utf8.parsed.duration": nil, + "utf8.parsed.level": "warn", + "utf8.parsed.method": nil, + "utf8.parsed.status": nil, + }, + { + "utf8.parsed.code": nil, + "utf8.parsed.duration": "123ms", + "utf8.parsed.level": "error", + "utf8.parsed.method": "POST", + "utf8.parsed.status": "500", + }, + }, + }, + { + name: "extract all keys with errors when none requested", + schema: arrow.NewSchema([]arrow.Field{ + semconv.FieldFromFQN("utf8.builtin.message", true), + }, nil), + input: arrowtest.Rows{ + {colMsg: `{"level": "info", "status": "200", "method": "GET"}`}, // Valid line + {colMsg: `{"level": "error", "code": 500}`}, // Also valid, adds code column + {colMsg: `{"msg": "unclosed}`}, // Unclosed quote + {colMsg: `{"level": "debug", "method": "POST"}`}, // Valid line + }, + requestedKeys: nil, // nil means extract all keys + expectedFields: 6, // 6 columns: level, method, status, code, __error__, __error_details__ + expectedOutput: 
arrowtest.Rows{ + { + "utf8.parsed.level": "info", + "utf8.parsed.method": "GET", + "utf8.parsed.status": "200", + "utf8.parsed.code": nil, + semconv.ColumnIdentError.FQN(): nil, + semconv.ColumnIdentErrorDetails.FQN(): nil, + }, + { + "utf8.parsed.level": "error", + "utf8.parsed.method": nil, + "utf8.parsed.status": nil, + "utf8.parsed.code": "500", + semconv.ColumnIdentError.FQN(): nil, + semconv.ColumnIdentErrorDetails.FQN(): nil, + }, + { + "utf8.parsed.level": nil, + "utf8.parsed.method": nil, + "utf8.parsed.status": nil, + "utf8.parsed.code": nil, + semconv.ColumnIdentError.FQN(): "JSONParserErr", + semconv.ColumnIdentErrorDetails.FQN(): "Value is string, but can't find closing '\"' symbol", + }, + { + "utf8.parsed.level": "debug", + "utf8.parsed.method": "POST", + "utf8.parsed.status": nil, + "utf8.parsed.code": nil, + semconv.ColumnIdentError.FQN(): nil, + semconv.ColumnIdentErrorDetails.FQN(): nil, + }, + }, + }, + { + name: "handle nested JSON objects with underscore flattening", + schema: arrow.NewSchema([]arrow.Field{ + semconv.FieldFromFQN("utf8.builtin.message", true), + }, nil), + input: arrowtest.Rows{ + {colMsg: `{"user": {"name": "john", "details": {"age": "30", "city": "NYC"}}, "status": "active"}`}, + {colMsg: `{"app": {"version": "1.0", "config": {"debug": "true"}}, "level": "info"}`}, + {colMsg: `{"nested": {"deep": {"very": {"deep": "value"}}}}`}, + }, + requestedKeys: nil, // Extract all keys including nested ones + expectedFields: 8, // app_config_debug, app_version, level, nested_deep_very_deep, status, user_details_age, user_details_city, user_name + expectedOutput: arrowtest.Rows{ + { + "utf8.parsed.app_config_debug": nil, + "utf8.parsed.app_version": nil, + "utf8.parsed.level": nil, + "utf8.parsed.nested_deep_very_deep": nil, + "utf8.parsed.status": "active", + "utf8.parsed.user_details_age": "30", + "utf8.parsed.user_details_city": "NYC", + "utf8.parsed.user_name": "john", + }, + { + "utf8.parsed.app_config_debug": "true", + "utf8.parsed.app_version": "1.0", + "utf8.parsed.level": "info", + "utf8.parsed.nested_deep_very_deep": nil, + "utf8.parsed.status": nil, + "utf8.parsed.user_details_age": nil, + "utf8.parsed.user_details_city": nil, + "utf8.parsed.user_name": nil, + }, + { + "utf8.parsed.app_config_debug": nil, + "utf8.parsed.app_version": nil, + "utf8.parsed.level": nil, + "utf8.parsed.nested_deep_very_deep": "value", + "utf8.parsed.status": nil, + "utf8.parsed.user_details_age": nil, + "utf8.parsed.user_details_city": nil, + "utf8.parsed.user_name": nil, + }, + }, + }, + { + name: "handle nested JSON with specific requested keys", + schema: arrow.NewSchema([]arrow.Field{ + semconv.FieldFromFQN("utf8.builtin.message", true), + }, nil), + input: arrowtest.Rows{ + {colMsg: `{"user": {"name": "alice", "profile": {"email": "alice@example.com"}}, "level": "debug"}`}, + {colMsg: `{"user": {"name": "bob"}, "level": "info"}`}, + {colMsg: `{"level": "error", "error": {"code": "500", "message": "internal"}}`}, + }, + requestedKeys: []string{"user_name", "user_profile_email", "level"}, + expectedFields: 3, // level, user_name, user_profile_email + expectedOutput: arrowtest.Rows{ + { + "utf8.parsed.level": "debug", + "utf8.parsed.user_name": "alice", + "utf8.parsed.user_profile_email": "alice@example.com", + }, + { + "utf8.parsed.level": "info", + "utf8.parsed.user_name": "bob", + "utf8.parsed.user_profile_email": nil, + }, + { + "utf8.parsed.level": "error", + "utf8.parsed.user_name": nil, + "utf8.parsed.user_profile_email": nil, + }, + }, + }, + { + name: "accept 
JSON numbers as strings (v1 compatibility)", + schema: arrow.NewSchema([]arrow.Field{ + semconv.FieldFromFQN("utf8.builtin.message", true), + }, nil), + input: arrowtest.Rows{ + {colMsg: `{"status": 200, "port": 8080, "timeout": 30.5, "retries": 0}`}, + {colMsg: `{"level": "info", "pid": 12345, "memory": 256.8}`}, + {colMsg: `{"score": -1, "version": 2.1, "enabled": true}`}, + }, + requestedKeys: []string{"status", "port", "timeout", "pid", "memory", "score", "version"}, + expectedFields: 7, // memory, pid, port, score, status, timeout, version + expectedOutput: arrowtest.Rows{ + { + "utf8.parsed.memory": nil, + "utf8.parsed.pid": nil, + "utf8.parsed.port": "8080", + "utf8.parsed.score": nil, + "utf8.parsed.status": "200", + "utf8.parsed.timeout": "30.5", + "utf8.parsed.version": nil, + }, + { + "utf8.parsed.memory": "256.8", + "utf8.parsed.pid": "12345", + "utf8.parsed.port": nil, + "utf8.parsed.score": nil, + "utf8.parsed.status": nil, + "utf8.parsed.timeout": nil, + "utf8.parsed.version": nil, + }, + { + "utf8.parsed.memory": nil, + "utf8.parsed.pid": nil, + "utf8.parsed.port": nil, + "utf8.parsed.score": "-1", + "utf8.parsed.status": nil, + "utf8.parsed.timeout": nil, + "utf8.parsed.version": "2.1", + }, + }, + }, + { + name: "mixed nested objects and numbers", + schema: arrow.NewSchema([]arrow.Field{ + semconv.FieldFromFQN("utf8.builtin.message", true), + }, nil), + input: arrowtest.Rows{ + {colMsg: `{"request": {"url": "/api/users", "method": "GET"}, "response": {"status": 200, "time": 45.2}}`}, + {colMsg: `{"user": {"id": 123, "profile": {"age": 25}}, "active": true}`}, + }, + requestedKeys: nil, // Extract all keys + expectedFields: 7, // active, request_method, request_url, response_status, response_time, user_id, user_profile_age + expectedOutput: arrowtest.Rows{ + { + "utf8.parsed.active": nil, + "utf8.parsed.request_method": "GET", + "utf8.parsed.request_url": "/api/users", + "utf8.parsed.response_status": "200", + "utf8.parsed.response_time": "45.2", + "utf8.parsed.user_id": nil, + "utf8.parsed.user_profile_age": nil, + }, + { + "utf8.parsed.active": "true", + "utf8.parsed.request_method": nil, + "utf8.parsed.request_url": nil, + "utf8.parsed.response_status": nil, + "utf8.parsed.response_time": nil, + "utf8.parsed.user_id": "123", + "utf8.parsed.user_profile_age": "25", + }, + }, + }, + } { + t.Run(tt.name, func(t *testing.T) { + alloc := memory.DefaultAllocator + + expr := &physical.FunctionExpr{ + Op: types.FunctionOpParseJSON, + Expressions: []physical.Expression{ + &physical.ColumnExpr{ + Ref: semconv.ColumnIdentMessage.ColumnRef(), + }, + &physical.NamedLiteralExpr{ + Name: types.ParseRequestedKeys, + Literal: types.NewLiteral(tt.requestedKeys), + }, + }, + } + e := newExpressionEvaluator() + + record := tt.input.Record(alloc, tt.schema) + colVec, err := e.eval(expr, record) + require.NoError(t, err) + id := colVec.Type().ArrowType().ID() + require.Equal(t, arrow.STRUCT, id) + + arr, ok := colVec.ToArray().(*array.Struct) + require.True(t, ok) + + require.Equal(t, tt.expectedFields, arr.NumField()) //value, error, errorDetails + + // Convert record to rows for comparison + actual, err := structToRows(arr) + require.NoError(t, err) + require.Equal(t, tt.expectedOutput, actual) + }) + } +} + +func TestEvaluateParseExpression_HandleMissingRequestedKeys(t *testing.T) { + colMsg := "utf8.builtin.message" + alloc := memory.DefaultAllocator + + expr := &physical.FunctionExpr{ + Op: types.FunctionOpParseJSON, + Expressions: []physical.Expression{ + &physical.ColumnExpr{ + Ref: 
semconv.ColumnIdentMessage.ColumnRef(),
+			},
+		},
+	}
+	e := newExpressionEvaluator()
+
+	input := arrowtest.Rows{
+		{colMsg: `{"level": "error", "status": "500"}`},
+		{colMsg: `{"level": "info", "status": "200"}`},
+		{colMsg: `{"level": "debug", "status": "201"}`},
+	}
+
+	schema := arrow.NewSchema([]arrow.Field{
+		semconv.FieldFromFQN("utf8.builtin.message", true),
+	}, nil)
+
+	record := input.Record(alloc, schema)
+	colVec, err := e.eval(expr, record)
+	require.NoError(t, err)
+	id := colVec.Type().ArrowType().ID()
+	require.Equal(t, arrow.STRUCT, id)
+
+	arr, ok := colVec.ToArray().(*array.Struct)
+	require.True(t, ok)
+
+	require.Equal(t, 2, arr.NumField()) // level, status
+
+	// Convert record to rows for comparison
+	actual, err := structToRows(arr)
+	require.NoError(t, err)
+
+	expectedOutput := arrowtest.Rows{
+		{"utf8.parsed.level": "error", "utf8.parsed.status": "500"},
+		{"utf8.parsed.level": "info", "utf8.parsed.status": "200"},
+		{"utf8.parsed.level": "debug", "utf8.parsed.status": "201"},
+	}
+	require.Equal(t, expectedOutput, actual)
+}
diff --git a/pkg/engine/internal/executor/functions.go b/pkg/engine/internal/executor/functions.go
index 1505cb5c52480..59c445b442b10 100644
--- a/pkg/engine/internal/executor/functions.go
+++ b/pkg/engine/internal/executor/functions.go
@@ -17,6 +17,7 @@ import (
 var (
 	unaryFunctions  UnaryFunctionRegistry  = &unaryFuncReg{}
 	binaryFunctions BinaryFunctionRegistry = &binaryFuncReg{}
+	functions       FunctionRegistry       = &funcReg{}
 )
 
 func init() {
@@ -108,6 +109,10 @@
 	unaryFunctions.register(types.UnaryOpCastFloat, arrow.BinaryTypes.String, castFn(types.UnaryOpCastFloat))
 	unaryFunctions.register(types.UnaryOpCastBytes, arrow.BinaryTypes.String, castFn(types.UnaryOpCastBytes))
 	unaryFunctions.register(types.UnaryOpCastDuration, arrow.BinaryTypes.String, castFn(types.UnaryOpCastDuration))
+
+	// Parse functions
+	functions.register(types.FunctionOpParseLogfmt, parseFn(types.FunctionOpParseLogfmt))
+	functions.register(types.FunctionOpParseJSON, parseFn(types.FunctionOpParseJSON))
 }
 
 type UnaryFunctionRegistry interface {
@@ -342,3 +347,41 @@ func boolToInt(b bool) int {
 	}
 	return i
 }
+
+type FunctionRegistry interface {
+	register(types.FunctionOp, Function)
+	GetForSignature(types.FunctionOp) (Function, error)
+}
+
+type Function interface {
+	Evaluate(args ...arrow.Array) (arrow.Array, error)
+}
+
+type FunctionFunc func(args ...arrow.Array) (arrow.Array, error)
+
+func (f FunctionFunc) Evaluate(args ...arrow.Array) (arrow.Array, error) {
+	return f(args...)
+}
+
+type funcReg struct {
+	reg map[types.FunctionOp]Function
+}
+
+// register implements FunctionRegistry.
+func (u *funcReg) register(op types.FunctionOp, f Function) {
+	if u.reg == nil {
+		u.reg = make(map[types.FunctionOp]Function)
+	}
+	// TODO(twhitney): Should the function panic when duplicate keys are registered?
+	u.reg[op] = f
+}
+
+// GetForSignature implements FunctionRegistry.
+func (u *funcReg) GetForSignature(op types.FunctionOp) (Function, error) { + // Get registered function for the specific operation + fn, ok := u.reg[op] + if !ok { + return nil, errors.ErrNotImplemented + } + return fn, nil +} diff --git a/pkg/engine/internal/executor/parse.go b/pkg/engine/internal/executor/parse.go index 8ce953e76196b..0b55a417a4bbe 100644 --- a/pkg/engine/internal/executor/parse.go +++ b/pkg/engine/internal/executor/parse.go @@ -1,7 +1,6 @@ package executor import ( - "context" "fmt" "sort" "unsafe" @@ -10,48 +9,30 @@ import ( "github.com/apache/arrow-go/v18/arrow/array" "github.com/apache/arrow-go/v18/arrow/memory" - "github.com/grafana/loki/v3/pkg/engine/internal/planner/physical" "github.com/grafana/loki/v3/pkg/engine/internal/semconv" "github.com/grafana/loki/v3/pkg/engine/internal/types" ) -func NewParsePipeline(parse *physical.ParseNode, input Pipeline) *GenericPipeline { - return newGenericPipeline(func(ctx context.Context, inputs []Pipeline) (arrow.Record, error) { - // Pull the next item from the input pipeline - input := inputs[0] - batch, err := input.Read(ctx) +func parseFn(op types.FunctionOp) Function { + return FunctionFunc(func(args ...ColumnVector) (ColumnVector, error) { + sourceCol, requestedKeys, err := extractParseFnParameters(args) if err != nil { return nil, err } - // Find the message column - msgCol, msgIdx, err := columnForIdent(semconv.ColumnIdentMessage, batch) - if err != nil { - return nil, err - } - - stringCol, ok := msgCol.(*array.String) - if !ok { - return nil, fmt.Errorf("column %s must be of type utf8, got %s", semconv.ColumnIdentMessage.FQN(), batch.Schema().Field(msgIdx)) - } - var headers []string var parsedColumns []arrow.Array - switch parse.Kind { - case physical.ParserLogfmt: - headers, parsedColumns = buildLogfmtColumns(stringCol, parse.RequestedKeys) - case physical.ParserJSON: - headers, parsedColumns = buildJSONColumns(stringCol, parse.RequestedKeys) + switch op { + case types.FunctionOpParseLogfmt: + headers, parsedColumns = buildLogfmtColumns(sourceCol, requestedKeys) + case types.FunctionOpParseJSON: + headers, parsedColumns = buildJSONColumns(sourceCol, requestedKeys) default: - return nil, fmt.Errorf("unsupported parser kind: %v", parse.Kind) + return nil, fmt.Errorf("unsupported parser kind: %v", op) } // Build new schema with original fields plus parsed fields - schema := batch.Schema() - newFields := make([]arrow.Field, 0, schema.NumFields()+len(headers)) - for i := 0; i < schema.NumFields(); i++ { - newFields = append(newFields, schema.Field(i)) - } + newFields := make([]arrow.Field, 0, len(headers)) for _, header := range headers { ct := types.ColumnTypeParsed if header == semconv.ColumnIdentError.ShortName() || header == semconv.ColumnIdentErrorDetails.ShortName() { @@ -60,34 +41,54 @@ func NewParsePipeline(parse *physical.ParseNode, input Pipeline) *GenericPipelin ident := semconv.NewIdentifier(header, ct, types.Loki.String) newFields = append(newFields, semconv.FieldFromIdent(ident, true)) } - newSchema := arrow.NewSchema(newFields, nil) - - // Build new record with all columns - numOriginalCols := int(batch.NumCols()) - numParsedCols := len(parsedColumns) - allColumns := make([]arrow.Array, numOriginalCols+numParsedCols) - // Copy original columns - for i := range numOriginalCols { - col := batch.Column(i) - allColumns[i] = col + result, err := array.NewStructArrayWithFields(parsedColumns, newFields) + if err != nil { + return nil, err } - // Add parsed columns - for i, col := range parsedColumns { - // 
Defenisve check added for clarity and safety, but BuildLogfmtColumns should already guarantee this
-			if col.Len() != stringCol.Len() {
-				return nil, fmt.Errorf("parsed column %d (%s) has %d rows but expected %d",
-					i, headers[i], col.Len(), stringCol.Len())
-			}
-			allColumns[numOriginalCols+i] = col
-		}
+		return &ArrayStruct{
+			array: result,
+			ct:    types.ColumnTypeParsed,
+			rows:  int64(sourceCol.Len()),
+		}, nil
+	})
+}
+
+func extractParseFnParameters(args []ColumnVector) (*array.String, []string, error) {
+	// Valid signatures:
+	//   parse(sourceColVec)
+	//   parse(sourceColVec, requestedKeys)
+	if len(args) < 1 || len(args) > 2 {
+		return nil, nil, fmt.Errorf("parse function expected 1 or 2 arguments, got %d", len(args))
+	}
+
+	var sourceColVec, requestedKeysColVec ColumnVector
+	sourceColVec = args[0]
+	if len(args) == 2 {
+		requestedKeysColVec = args[1]
+	}
+
+	if sourceColVec == nil {
+		return nil, nil, fmt.Errorf("parse function arguments did not include a source ColumnVector to parse")
+	}
+
+	arr := sourceColVec.ToArray()
-		// Create the new record
-		newRecord := array.NewRecord(newSchema, allColumns, int64(stringCol.Len()))
+	sourceCol, ok := arr.(*array.String)
+	if !ok {
+		return nil, nil, fmt.Errorf("parse can only operate on string column types, got %T", arr)
+	}
-		return newRecord, nil
-	}, input)
+	var requestedKeys []string
+	if requestedKeysColVec != nil {
+		reqKeysValue := requestedKeysColVec.Value(0)
+		requestedKeys, ok = reqKeysValue.([]string)
+		if !ok {
+			return nil, nil, fmt.Errorf("expected %s argument to be of type []string, got %T", types.ParseRequestedKeys, reqKeysValue)
+		}
+	}
+	return sourceCol, requestedKeys, nil
+}
diff --git a/pkg/engine/internal/executor/parse_test.go b/pkg/engine/internal/executor/parse_test.go
deleted file mode 100644
index bdde3fc27c167..0000000000000
--- a/pkg/engine/internal/executor/parse_test.go
+++ /dev/null
@@ -1,730 +0,0 @@
-package executor
-
-import (
-	"testing"
-	"time"
-
-	"github.com/apache/arrow-go/v18/arrow"
-	"github.com/stretchr/testify/require"
-
-	"github.com/grafana/loki/v3/pkg/engine/internal/planner/physical"
-	"github.com/grafana/loki/v3/pkg/engine/internal/semconv"
-	"github.com/grafana/loki/v3/pkg/engine/internal/types"
-	"github.com/grafana/loki/v3/pkg/util/arrowtest"
-)
-
-func TestNewParsePipeline_logfmt(t *testing.T) {
-	var (
-		colTs  = "timestamp_ns.builtin.timestamp"
-		colMsg = "utf8.builtin.message"
-	)
-
-	for _, tt := range []struct {
-		name           string
-		schema         *arrow.Schema
-		input          arrowtest.Rows
-		requestedKeys  []string
-		expectedFields int
-		expectedOutput arrowtest.Rows
-	}{
-		{
-			name: "parse stage transforms records, adding columns parsed from message",
-			schema: arrow.NewSchema([]arrow.Field{
-				semconv.FieldFromFQN("utf8.builtin.message", true),
-			}, nil),
-			input: arrowtest.Rows{
-				{colMsg: "level=error status=500"},
-				{colMsg: "level=info status=200"},
-				{colMsg: "level=debug status=201"},
-			},
-			requestedKeys:  []string{"level", "status"},
-			expectedFields: 3, // 3 columns: message, level, status
-			expectedOutput: arrowtest.Rows{
-				{
-					colMsg:               "level=error status=500",
-					"utf8.parsed.level":  "error",
-					"utf8.parsed.status": "500",
-				},
-				{
-					colMsg:               "level=info status=200",
-					"utf8.parsed.level":  "info",
-					"utf8.parsed.status": "200",
-				},
-				{
-					colMsg:               "level=debug status=201",
-					"utf8.parsed.level":  "debug",
-					"utf8.parsed.status": "201",
-				},
-			},
-		},
-		{
-			name: "parse stage preserves existing columns",
-
schema: arrow.NewSchema([]arrow.Field{ - semconv.FieldFromFQN("timestamp_ns.builtin.timestamp", true), - semconv.FieldFromFQN("utf8.builtin.message", true), - semconv.FieldFromFQN("utf8.label.app", true), - }, nil), - input: arrowtest.Rows{ - {colTs: time.Unix(1, 0).UTC(), colMsg: "level=error status=500", "utf8.label.app": "frontend"}, - {colTs: time.Unix(2, 0).UTC(), colMsg: "level=info status=200", "utf8.label.app": "backend"}, - }, - requestedKeys: []string{"level", "status"}, - expectedFields: 5, // 5 columns: timestamp, message, app, level, status - expectedOutput: arrowtest.Rows{ - { - colTs: time.Unix(1, 0).UTC(), - colMsg: "level=error status=500", - "utf8.label.app": "frontend", - "utf8.parsed.level": "error", - "utf8.parsed.status": "500", - }, - { - colTs: time.Unix(2, 0).UTC(), - colMsg: "level=info status=200", - "utf8.label.app": "backend", - "utf8.parsed.level": "info", - "utf8.parsed.status": "200", - }, - }, - }, - { - name: "handle missing keys with NULL", - schema: arrow.NewSchema([]arrow.Field{ - semconv.FieldFromFQN("utf8.builtin.message", true), - }, nil), - input: arrowtest.Rows{ - {colMsg: "level=error"}, - {colMsg: "status=200"}, - {colMsg: "level=info"}, - }, - requestedKeys: []string{"level"}, - expectedFields: 2, // 2 columns: message, level - expectedOutput: arrowtest.Rows{ - { - colMsg: "level=error", - "utf8.parsed.level": "error", - }, - { - colMsg: "status=200", - "utf8.parsed.level": nil, - }, - { - colMsg: "level=info", - "utf8.parsed.level": "info", - }, - }, - }, - { - name: "handle errors with error columns", - schema: arrow.NewSchema([]arrow.Field{ - semconv.FieldFromFQN("utf8.builtin.message", true), - }, nil), - input: arrowtest.Rows{ - {colMsg: "level=info status=200"}, // No errors - {colMsg: "status==value level=error"}, // Double equals error on requested key - {colMsg: "level=\"unclosed status=500"}, // Unclosed quote error - }, - requestedKeys: []string{"level", "status"}, - expectedFields: 5, // 5 columns: message, level, status, __error__, __error_details__ - expectedOutput: arrowtest.Rows{ - { - colMsg: "level=info status=200", - "utf8.parsed.level": "info", - "utf8.parsed.status": "200", - semconv.ColumnIdentError.FQN(): nil, - semconv.ColumnIdentErrorDetails.FQN(): nil, - }, - { - colMsg: "status==value level=error", - "utf8.parsed.level": nil, - "utf8.parsed.status": nil, - semconv.ColumnIdentError.FQN(): types.LogfmtParserErrorType, - semconv.ColumnIdentErrorDetails.FQN(): "logfmt syntax error at pos 8 : unexpected '='", - }, - { - colMsg: "level=\"unclosed status=500", - "utf8.parsed.level": nil, - "utf8.parsed.status": nil, - semconv.ColumnIdentError.FQN(): types.LogfmtParserErrorType, - semconv.ColumnIdentErrorDetails.FQN(): "logfmt syntax error at pos 27 : unterminated quoted value", - }, - }, - }, - { - name: "extract all keys when none requested", - schema: arrow.NewSchema([]arrow.Field{ - semconv.FieldFromFQN("utf8.builtin.message", true), - }, nil), - input: arrowtest.Rows{ - {colMsg: "level=info status=200 method=GET"}, - {colMsg: "level=warn code=304"}, - {colMsg: "level=error status=500 method=POST duration=123ms"}, - }, - requestedKeys: nil, // nil means extract all keys - expectedFields: 6, // 6 columns: message, code, duration, level, method, status - expectedOutput: arrowtest.Rows{ - { - colMsg: "level=info status=200 method=GET", - "utf8.parsed.code": nil, - "utf8.parsed.duration": nil, - "utf8.parsed.level": "info", - "utf8.parsed.method": "GET", - "utf8.parsed.status": "200", - }, - { - colMsg: "level=warn code=304", 
- "utf8.parsed.code": "304", - "utf8.parsed.duration": nil, - "utf8.parsed.level": "warn", - "utf8.parsed.method": nil, - "utf8.parsed.status": nil, - }, - { - colMsg: "level=error status=500 method=POST duration=123ms", - "utf8.parsed.code": nil, - "utf8.parsed.duration": "123ms", - "utf8.parsed.level": "error", - "utf8.parsed.method": "POST", - "utf8.parsed.status": "500", - }, - }, - }, - { - name: "extract all keys with errors when none requested", - schema: arrow.NewSchema([]arrow.Field{ - semconv.FieldFromFQN("utf8.builtin.message", true), - }, nil), - input: arrowtest.Rows{ - {colMsg: "level=info status=200 method=GET"}, // Valid line - {colMsg: "level==error code=500"}, // Double equals error - {colMsg: "msg=\"unclosed duration=100ms code=400"}, // Unclosed quote error - {colMsg: "level=debug method=POST"}, // Valid line - }, - requestedKeys: nil, // nil means extract all keys - expectedFields: 6, // 6 columns: message, level, method, status, __error__, __error_details__ - expectedOutput: arrowtest.Rows{ - { - colMsg: "level=info status=200 method=GET", - "utf8.parsed.level": "info", - "utf8.parsed.method": "GET", - "utf8.parsed.status": "200", - semconv.ColumnIdentError.FQN(): nil, - semconv.ColumnIdentErrorDetails.FQN(): nil, - }, - { - colMsg: "level==error code=500", - "utf8.parsed.level": nil, - "utf8.parsed.method": nil, - "utf8.parsed.status": nil, - semconv.ColumnIdentError.FQN(): types.LogfmtParserErrorType, - semconv.ColumnIdentErrorDetails.FQN(): "logfmt syntax error at pos 7 : unexpected '='", - }, - { - colMsg: "msg=\"unclosed duration=100ms code=400", - "utf8.parsed.level": nil, - "utf8.parsed.method": nil, - "utf8.parsed.status": nil, - semconv.ColumnIdentError.FQN(): types.LogfmtParserErrorType, - semconv.ColumnIdentErrorDetails.FQN(): "logfmt syntax error at pos 38 : unterminated quoted value", - }, - { - colMsg: "level=debug method=POST", - "utf8.parsed.level": "debug", - "utf8.parsed.method": "POST", - "utf8.parsed.status": nil, - semconv.ColumnIdentError.FQN(): nil, - semconv.ColumnIdentErrorDetails.FQN(): nil, - }, - }, - }, - { - name: "handle duplicate keys with first-wins semantics", - schema: arrow.NewSchema([]arrow.Field{ - semconv.FieldFromFQN("utf8.builtin.message", true), - }, nil), - input: arrowtest.Rows{ - {colMsg: "level=info status=200 level=debug"}, - }, - requestedKeys: nil, - expectedFields: 3, // 3 columns: message, level, status - expectedOutput: arrowtest.Rows{ - { - colMsg: "level=info status=200 level=debug", - "utf8.parsed.level": "info", - "utf8.parsed.status": "200", - }, - }, - }, - { - name: "parsed keys with invalid characteres are sanitized", - schema: arrow.NewSchema([]arrow.Field{ - semconv.FieldFromFQN("utf8.builtin.message", true), - }, nil), - input: arrowtest.Rows{ - {colMsg: "level=info status=200 index-store=1234"}, - }, - requestedKeys: nil, - expectedFields: 4, // 4 columns: message, level, status, index-store - expectedOutput: arrowtest.Rows{ - { - colMsg: "level=info status=200 index-store=1234", - "utf8.parsed.level": "info", - "utf8.parsed.status": "200", - "utf8.parsed.index_store": "1234", - }, - }, - }, - } { - t.Run(tt.name, func(t *testing.T) { - // Create input data with message column containing logfmt - input := NewArrowtestPipeline( - tt.schema, - tt.input, - ) - - // Create ParseNode requesting "level" field - parseNode := &physical.ParseNode{ - Kind: physical.ParserLogfmt, - RequestedKeys: tt.requestedKeys, - } - - pipeline := NewParsePipeline(parseNode, input) - - // Read first record - ctx := t.Context() - 
record, err := pipeline.Read(ctx) - require.NoError(t, err) - - // Verify the output has the expected number of fields - outputSchema := record.Schema() - require.Equal(t, tt.expectedFields, outputSchema.NumFields()) - - // Convert record to rows for comparison - actual, err := arrowtest.RecordRows(record) - require.NoError(t, err) - require.Equal(t, tt.expectedOutput, actual) - }) - } -} - -func TestNewParsePipeline_JSON(t *testing.T) { - var ( - colTs = "timestamp_ns.builtin.timestamp" - colMsg = "utf8.builtin.message" - ) - - for _, tt := range []struct { - name string - schema *arrow.Schema - input arrowtest.Rows - requestedKeys []string - expectedFields int - expectedOutput arrowtest.Rows - }{ - { - name: "parse stage transforms records, adding columns parsed from message", - schema: arrow.NewSchema([]arrow.Field{ - semconv.FieldFromFQN("utf8.builtin.message", true), - }, nil), - input: arrowtest.Rows{ - {colMsg: `{"level": "error", "status": "500"}`}, - {colMsg: `{"level": "info", "status": "200"}`}, - {colMsg: `{"level": "debug", "status": "201"}`}, - }, - requestedKeys: []string{"level", "status"}, - expectedFields: 3, // 3 columns: message, level, status - expectedOutput: arrowtest.Rows{ - {colMsg: `{"level": "error", "status": "500"}`, "utf8.parsed.level": "error", "utf8.parsed.status": "500"}, - {colMsg: `{"level": "info", "status": "200"}`, "utf8.parsed.level": "info", "utf8.parsed.status": "200"}, - {colMsg: `{"level": "debug", "status": "201"}`, "utf8.parsed.level": "debug", "utf8.parsed.status": "201"}, - }, - }, - { - name: "parse stage preserves existing columns", - schema: arrow.NewSchema([]arrow.Field{ - semconv.FieldFromFQN("timestamp_ns.builtin.timestamp", true), - semconv.FieldFromFQN("utf8.builtin.message", true), - semconv.FieldFromFQN("utf8.label.app", true), - }, nil), - input: arrowtest.Rows{ - {colTs: time.Unix(1, 0).UTC(), colMsg: `{"level": "error", "status": "500"}`, "utf8.label.app": "frontend"}, - {colTs: time.Unix(2, 0).UTC(), colMsg: `{"level": "info", "status": "200"}`, "utf8.label.app": "backend"}, - }, - requestedKeys: []string{"level", "status"}, - expectedFields: 5, // 5 columns: timestamp, message, app, level, status - expectedOutput: arrowtest.Rows{ - {colTs: time.Unix(1, 0).UTC(), colMsg: `{"level": "error", "status": "500"}`, "utf8.label.app": "frontend", "utf8.parsed.level": "error", "utf8.parsed.status": "500"}, - {colTs: time.Unix(2, 0).UTC(), colMsg: `{"level": "info", "status": "200"}`, "utf8.label.app": "backend", "utf8.parsed.level": "info", "utf8.parsed.status": "200"}, - }, - }, - { - name: "handle missing keys with NULL", - schema: arrow.NewSchema([]arrow.Field{ - semconv.FieldFromFQN("utf8.builtin.message", true), - }, nil), - input: arrowtest.Rows{ - {colMsg: `{"level": "error"}`}, - {colMsg: `{"status": "200"}`}, - {colMsg: `{"level": "info"}`}, - }, - requestedKeys: []string{"level"}, - expectedFields: 2, // 2 columns: message, level - expectedOutput: arrowtest.Rows{ - {colMsg: `{"level": "error"}`, "utf8.parsed.level": "error"}, - {colMsg: `{"status": "200"}`, "utf8.parsed.level": nil}, - {colMsg: `{"level": "info"}`, "utf8.parsed.level": "info"}, - }, - }, - { - name: "handle errors with error columns", - schema: arrow.NewSchema([]arrow.Field{ - semconv.FieldFromFQN("utf8.builtin.message", true), - }, nil), - input: arrowtest.Rows{ - {colMsg: `{"level": "info", "status": "200"}`}, // No errors - {colMsg: `{"level": "error", "status":`}, // Missing closing brace and value - {colMsg: `{"level": "info", "status": 200}`}, // Number 
should be converted to string - }, - requestedKeys: []string{"level", "status"}, - expectedFields: 5, // 5 columns: message, level, status, __error__, __error_details__ (due to malformed JSON) - expectedOutput: arrowtest.Rows{ - { - colMsg: `{"level": "info", "status": "200"}`, - "utf8.parsed.level": "info", - "utf8.parsed.status": "200", - semconv.ColumnIdentError.FQN(): nil, - semconv.ColumnIdentErrorDetails.FQN(): nil, - }, - { - colMsg: `{"level": "error", "status":`, - "utf8.parsed.level": nil, - "utf8.parsed.status": nil, - semconv.ColumnIdentError.FQN(): "JSONParserErr", - semconv.ColumnIdentErrorDetails.FQN(): "Malformed JSON error", - }, - { - colMsg: `{"level": "info", "status": 200}`, - "utf8.parsed.level": "info", - "utf8.parsed.status": "200", - semconv.ColumnIdentError.FQN(): nil, - semconv.ColumnIdentErrorDetails.FQN(): nil, - }, - }, - }, - { - name: "extract all keys when none requested", - schema: arrow.NewSchema([]arrow.Field{ - semconv.FieldFromFQN("utf8.builtin.message", true), - }, nil), - input: arrowtest.Rows{ - {colMsg: `{"level": "info", "status": "200", "method": "GET"}`}, - {colMsg: `{"level": "warn", "code": "304"}`}, - {colMsg: `{"level": "error", "status": "500", "method": "POST", "duration": "123ms"}`}, - }, - requestedKeys: nil, // nil means extract all keys - expectedFields: 6, // 6 columns: message, code, duration, level, method, status - expectedOutput: arrowtest.Rows{ - { - colMsg: `{"level": "info", "status": "200", "method": "GET"}`, - "utf8.parsed.code": nil, - "utf8.parsed.duration": nil, - "utf8.parsed.level": "info", - "utf8.parsed.method": "GET", - "utf8.parsed.status": "200", - }, - { - colMsg: `{"level": "warn", "code": "304"}`, - "utf8.parsed.code": "304", - "utf8.parsed.duration": nil, - "utf8.parsed.level": "warn", - "utf8.parsed.method": nil, - "utf8.parsed.status": nil, - }, - { - colMsg: `{"level": "error", "status": "500", "method": "POST", "duration": "123ms"}`, - "utf8.parsed.code": nil, - "utf8.parsed.duration": "123ms", - "utf8.parsed.level": "error", - "utf8.parsed.method": "POST", - "utf8.parsed.status": "500", - }, - }, - }, - { - name: "extract all keys with errors when none requested", - schema: arrow.NewSchema([]arrow.Field{ - semconv.FieldFromFQN("utf8.builtin.message", true), - }, nil), - input: arrowtest.Rows{ - {colMsg: `{"level": "info", "status": "200", "method": "GET"}`}, // Valid line - {colMsg: `{"level": "error", "code": 500}`}, // Also valid, adds code column - {colMsg: `{"msg": "unclosed}`}, // Unclosed quote - {colMsg: `{"level": "debug", "method": "POST"}`}, // Valid line - }, - requestedKeys: nil, // nil means extract all keys - expectedFields: 7, // 7 columns: message, level, method, status, code, __error__, __error_details__ - expectedOutput: arrowtest.Rows{ - { - colMsg: `{"level": "info", "status": "200", "method": "GET"}`, - "utf8.parsed.level": "info", - "utf8.parsed.method": "GET", - "utf8.parsed.status": "200", - "utf8.parsed.code": nil, - semconv.ColumnIdentError.FQN(): nil, - semconv.ColumnIdentErrorDetails.FQN(): nil, - }, - { - colMsg: `{"level": "error", "code": 500}`, - "utf8.parsed.level": "error", - "utf8.parsed.method": nil, - "utf8.parsed.status": nil, - "utf8.parsed.code": "500", - semconv.ColumnIdentError.FQN(): nil, - semconv.ColumnIdentErrorDetails.FQN(): nil, - }, - { - colMsg: `{"msg": "unclosed}`, - "utf8.parsed.level": nil, - "utf8.parsed.method": nil, - "utf8.parsed.status": nil, - "utf8.parsed.code": nil, - semconv.ColumnIdentError.FQN(): "JSONParserErr", - 
semconv.ColumnIdentErrorDetails.FQN(): "Value is string, but can't find closing '\"' symbol", - }, - { - colMsg: `{"level": "debug", "method": "POST"}`, - "utf8.parsed.level": "debug", - "utf8.parsed.method": "POST", - "utf8.parsed.status": nil, - "utf8.parsed.code": nil, - semconv.ColumnIdentError.FQN(): nil, - semconv.ColumnIdentErrorDetails.FQN(): nil, - }, - }, - }, - { - name: "handle nested JSON objects with underscore flattening", - schema: arrow.NewSchema([]arrow.Field{ - semconv.FieldFromFQN("utf8.builtin.message", true), - }, nil), - input: arrowtest.Rows{ - {colMsg: `{"user": {"name": "john", "details": {"age": "30", "city": "NYC"}}, "status": "active"}`}, - {colMsg: `{"app": {"version": "1.0", "config": {"debug": "true"}}, "level": "info"}`}, - {colMsg: `{"nested": {"deep": {"very": {"deep": "value"}}}}`}, - }, - requestedKeys: nil, // Extract all keys including nested ones - expectedFields: 9, // message, app_config_debug, app_version, level, nested_deep_very_deep, status, user_details_age, user_details_city, user_name - expectedOutput: arrowtest.Rows{ - { - colMsg: `{"user": {"name": "john", "details": {"age": "30", "city": "NYC"}}, "status": "active"}`, - "utf8.parsed.app_config_debug": nil, - "utf8.parsed.app_version": nil, - "utf8.parsed.level": nil, - "utf8.parsed.nested_deep_very_deep": nil, - "utf8.parsed.status": "active", - "utf8.parsed.user_details_age": "30", - "utf8.parsed.user_details_city": "NYC", - "utf8.parsed.user_name": "john", - }, - { - colMsg: `{"app": {"version": "1.0", "config": {"debug": "true"}}, "level": "info"}`, - "utf8.parsed.app_config_debug": "true", - "utf8.parsed.app_version": "1.0", - "utf8.parsed.level": "info", - "utf8.parsed.nested_deep_very_deep": nil, - "utf8.parsed.status": nil, - "utf8.parsed.user_details_age": nil, - "utf8.parsed.user_details_city": nil, - "utf8.parsed.user_name": nil, - }, - { - colMsg: `{"nested": {"deep": {"very": {"deep": "value"}}}}`, - "utf8.parsed.app_config_debug": nil, - "utf8.parsed.app_version": nil, - "utf8.parsed.level": nil, - "utf8.parsed.nested_deep_very_deep": "value", - "utf8.parsed.status": nil, - "utf8.parsed.user_details_age": nil, - "utf8.parsed.user_details_city": nil, - "utf8.parsed.user_name": nil, - }, - }, - }, - { - name: "handle nested JSON with specific requested keys", - schema: arrow.NewSchema([]arrow.Field{ - semconv.FieldFromFQN("utf8.builtin.message", true), - }, nil), - input: arrowtest.Rows{ - {colMsg: `{"user": {"name": "alice", "profile": {"email": "alice@example.com"}}, "level": "debug"}`}, - {colMsg: `{"user": {"name": "bob"}, "level": "info"}`}, - {colMsg: `{"level": "error", "error": {"code": "500", "message": "internal"}}`}, - }, - requestedKeys: []string{"user_name", "user_profile_email", "level"}, - expectedFields: 4, // message, level, user_name, user_profile_email - expectedOutput: arrowtest.Rows{ - { - colMsg: `{"user": {"name": "alice", "profile": {"email": "alice@example.com"}}, "level": "debug"}`, - "utf8.parsed.level": "debug", - "utf8.parsed.user_name": "alice", - "utf8.parsed.user_profile_email": "alice@example.com", - }, - { - colMsg: `{"user": {"name": "bob"}, "level": "info"}`, - "utf8.parsed.level": "info", - "utf8.parsed.user_name": "bob", - "utf8.parsed.user_profile_email": nil, - }, - { - colMsg: `{"level": "error", "error": {"code": "500", "message": "internal"}}`, - "utf8.parsed.level": "error", - "utf8.parsed.user_name": nil, - "utf8.parsed.user_profile_email": nil, - }, - }, - }, - { - name: "accept JSON numbers as strings (v1 compatibility)", - schema: 
arrow.NewSchema([]arrow.Field{ - semconv.FieldFromFQN("utf8.builtin.message", true), - }, nil), - input: arrowtest.Rows{ - {colMsg: `{"status": 200, "port": 8080, "timeout": 30.5, "retries": 0}`}, - {colMsg: `{"level": "info", "pid": 12345, "memory": 256.8}`}, - {colMsg: `{"score": -1, "version": 2.1, "enabled": true}`}, - }, - requestedKeys: []string{"status", "port", "timeout", "pid", "memory", "score", "version"}, - expectedFields: 8, // message, memory, pid, port, score, status, timeout, version - expectedOutput: arrowtest.Rows{ - { - colMsg: `{"status": 200, "port": 8080, "timeout": 30.5, "retries": 0}`, - "utf8.parsed.memory": nil, - "utf8.parsed.pid": nil, - "utf8.parsed.port": "8080", - "utf8.parsed.score": nil, - "utf8.parsed.status": "200", - "utf8.parsed.timeout": "30.5", - "utf8.parsed.version": nil, - }, - { - colMsg: `{"level": "info", "pid": 12345, "memory": 256.8}`, - "utf8.parsed.memory": "256.8", - "utf8.parsed.pid": "12345", - "utf8.parsed.port": nil, - "utf8.parsed.score": nil, - "utf8.parsed.status": nil, - "utf8.parsed.timeout": nil, - "utf8.parsed.version": nil, - }, - { - colMsg: `{"score": -1, "version": 2.1, "enabled": true}`, - "utf8.parsed.memory": nil, - "utf8.parsed.pid": nil, - "utf8.parsed.port": nil, - "utf8.parsed.score": "-1", - "utf8.parsed.status": nil, - "utf8.parsed.timeout": nil, - "utf8.parsed.version": "2.1", - }, - }, - }, - { - name: "mixed nested objects and numbers", - schema: arrow.NewSchema([]arrow.Field{ - semconv.FieldFromFQN("utf8.builtin.message", true), - }, nil), - input: arrowtest.Rows{ - {colMsg: `{"request": {"url": "/api/users", "method": "GET"}, "response": {"status": 200, "time": 45.2}}`}, - {colMsg: `{"user": {"id": 123, "profile": {"age": 25}}, "active": true}`}, - }, - requestedKeys: nil, // Extract all keys - expectedFields: 8, // message, active, request_method, request_url, response_status, response_time, user_id, user_profile_age - expectedOutput: arrowtest.Rows{ - { - colMsg: `{"request": {"url": "/api/users", "method": "GET"}, "response": {"status": 200, "time": 45.2}}`, - "utf8.parsed.active": nil, - "utf8.parsed.request_method": "GET", - "utf8.parsed.request_url": "/api/users", - "utf8.parsed.response_status": "200", - "utf8.parsed.response_time": "45.2", - "utf8.parsed.user_id": nil, - "utf8.parsed.user_profile_age": nil, - }, - { - colMsg: `{"user": {"id": 123, "profile": {"age": 25}}, "active": true}`, - "utf8.parsed.active": "true", - "utf8.parsed.request_method": nil, - "utf8.parsed.request_url": nil, - "utf8.parsed.response_status": nil, - "utf8.parsed.response_time": nil, - "utf8.parsed.user_id": "123", - "utf8.parsed.user_profile_age": "25", - }, - }, - }, - { - name: "duplicate field name takes first value", - schema: arrow.NewSchema([]arrow.Field{ - semconv.FieldFromFQN("utf8.builtin.message", true), - }, nil), - input: arrowtest.Rows{ - {colMsg: `{"app": "foo", "app": "duplicate"}`}, - }, - requestedKeys: nil, // Extract all keys - expectedFields: 2, // message, app - expectedOutput: arrowtest.Rows{ - { - colMsg: `{"app": "foo", "app": "duplicate"}`, - "utf8.parsed.app": "foo", - }, - }, - }, - { - name: "parsed keys with invalid characteres are sanitized", - schema: arrow.NewSchema([]arrow.Field{ - semconv.FieldFromFQN("utf8.builtin.message", true), - }, nil), - input: arrowtest.Rows{ - {colMsg: `{"index-store": "foo"}`}, - }, - requestedKeys: nil, - expectedFields: 2, // 2 columns: message, index-store - expectedOutput: arrowtest.Rows{ - { - colMsg: `{"index-store": "foo"}`, - "utf8.parsed.index_store": 
"foo", - }, - }, - }, - } { - t.Run(tt.name, func(t *testing.T) { - // Create input data with message column containing JSON - input := NewArrowtestPipeline( - tt.schema, - tt.input, - ) - - // Create ParseNode for JSON parsing - parseNode := &physical.ParseNode{ - Kind: physical.ParserJSON, - RequestedKeys: tt.requestedKeys, - } - - pipeline := NewParsePipeline(parseNode, input) - - // Read first record - ctx := t.Context() - record, err := pipeline.Read(ctx) - require.NoError(t, err) - - // Verify the output has the expected number of fields - outputSchema := record.Schema() - require.Equal(t, tt.expectedFields, outputSchema.NumFields()) - - // Convert record to rows for comparison - actual, err := arrowtest.RecordRows(record) - require.NoError(t, err) - require.Equal(t, tt.expectedOutput, actual) - }) - } -} diff --git a/pkg/engine/internal/planner/logical/builder.go b/pkg/engine/internal/planner/logical/builder.go index ec7269a260e0b..eba6ad8a47eef 100644 --- a/pkg/engine/internal/planner/logical/builder.go +++ b/pkg/engine/internal/planner/logical/builder.go @@ -3,6 +3,7 @@ package logical import ( "time" + "github.com/grafana/loki/v3/pkg/engine/internal/semconv" "github.com/grafana/loki/v3/pkg/engine/internal/types" ) @@ -39,35 +40,31 @@ func (b *Builder) Limit(skip uint32, fetch uint32) *Builder { } // Parse applies a [Parse] operation to the Builder. -func (b *Builder) Parse(kind ParserKind) *Builder { - return &Builder{ - val: &Parse{ - Table: b.val, - Kind: kind, +func (b *Builder) Parse(op types.FunctionOp) *Builder { + val := &FunctionOp{ + Op: op, + Values: []Value{ + // source column + &ColumnRef{ + Ref: semconv.ColumnIdentMessage.ColumnRef(), + }, }, } + return b.ProjectExpand(val) } // Cast applies an [Projection] operation, with an [UnaryOp] cast operation, to the Builder. -func (b *Builder) Cast(identifier string, operation types.UnaryOp) *Builder { - return &Builder{ - val: &Projection{ - Relation: b.val, - Expressions: []Value{ - &UnaryOp{ - Op: operation, - Value: &ColumnRef{ - Ref: types.ColumnRef{ - Column: identifier, - Type: types.ColumnTypeAmbiguous, - }, - }, - }, +func (b *Builder) Cast(identifier string, op types.UnaryOp) *Builder { + val := &UnaryOp{ + Op: op, + Value: &ColumnRef{ + Ref: types.ColumnRef{ + Column: identifier, + Type: types.ColumnTypeAmbiguous, }, - All: true, - Expand: true, }, } + return b.ProjectExpand(val) } // Sort applies a [Sort] operation to the Builder. diff --git a/pkg/engine/internal/planner/logical/builder_convert.go b/pkg/engine/internal/planner/logical/builder_convert.go index 3994f175b69ec..122274df142d7 100644 --- a/pkg/engine/internal/planner/logical/builder_convert.go +++ b/pkg/engine/internal/planner/logical/builder_convert.go @@ -47,10 +47,6 @@ func (b *ssaBuilder) process(value Value) (Value, error) { return b.processRangeAggregate(value) case *VectorAggregation: return b.processVectorAggregation(value) - case *Parse: - return b.processParsePlan(value) - case *UnaryOp: - return b.processUnaryOp(value) case *BinOp: return b.processBinOp(value) case *ColumnRef: @@ -152,19 +148,6 @@ func (b *ssaBuilder) processSortPlan(plan *Sort) (Value, error) { return plan, nil } -func (b *ssaBuilder) processParsePlan(plan *Parse) (Value, error) { - if _, err := b.process(plan.Table); err != nil { - return nil, err - } - - // Only append the first time we see this. 
- if plan.id == "" { - plan.id = fmt.Sprintf("%%%d", b.getID()) - b.instructions = append(b.instructions, plan) - } - return plan, nil -} - func (b *ssaBuilder) processUnaryOp(value *UnaryOp) (Value, error) { if _, err := b.process(value.Value); err != nil { return nil, err diff --git a/pkg/engine/internal/planner/logical/function_op.go b/pkg/engine/internal/planner/logical/function_op.go new file mode 100644 index 0000000000000..2c399aaae1c50 --- /dev/null +++ b/pkg/engine/internal/planner/logical/function_op.go @@ -0,0 +1,42 @@ +package logical + +import ( + "fmt" + "strings" + + "github.com/grafana/loki/v3/pkg/engine/internal/types" +) + +// The FunctionOp instruction yields the result of function operation Op Value. +// UnaryOp implements both [Instruction] and [Value]. +type FunctionOp struct { + id string + + Op types.FunctionOp + Values []Value +} + +var ( + _ Value = (*UnaryOp)(nil) + _ Instruction = (*UnaryOp)(nil) +) + +// Name returns an identifier for the UnaryOp operation. +func (u *FunctionOp) Name() string { + if u.id != "" { + return u.id + } + return fmt.Sprintf("%p", u) +} + +// String returns the disassembled SSA form of the FunctionOp instruction. +func (u *FunctionOp) String() string { + values := make([]string, len(u.Values)) + for i, v := range u.Values { + values[i] = v.String() + } + return fmt.Sprintf("%s(%s)", u.Op, strings.Join(values, ", ")) +} + +func (u *FunctionOp) isValue() {} +func (u *FunctionOp) isInstruction() {} diff --git a/pkg/engine/internal/planner/logical/node_parse.go b/pkg/engine/internal/planner/logical/node_parse.go deleted file mode 100644 index 835802a863993..0000000000000 --- a/pkg/engine/internal/planner/logical/node_parse.go +++ /dev/null @@ -1,54 +0,0 @@ -package logical - -import ( - "fmt" -) - -// ParserKind represents the type of parser to use -type ParserKind int - -const ( - ParserInvalid ParserKind = iota - ParserLogfmt - ParserJSON -) - -func (p ParserKind) String() string { - switch p { - case ParserLogfmt: - return "logfmt" - case ParserJSON: - return "json" - default: - return "invalid" - } -} - -// Parse represents a parsing instruction that extracts fields from log lines. -// It takes a table relation as input and produces a new table relation with -// additional columns for the parsed fields. -type Parse struct { - id string - - Table Value // The table relation to parse from - Kind ParserKind -} - -// Name returns an identifier for the Parse operation. -func (p *Parse) Name() string { - if p.id != "" { - return p.id - } - return fmt.Sprintf("%p", p) -} - -// String returns the string representation of the Parse instruction -func (p *Parse) String() string { - return fmt.Sprintf("PARSE %s [kind=%v]", p.Table.Name(), p.Kind) -} - -// isInstruction marks Parse as an Instruction -func (p *Parse) isInstruction() {} - -// isValue marks Parse as a Value -func (p *Parse) isValue() {} diff --git a/pkg/engine/internal/planner/logical/planner.go b/pkg/engine/internal/planner/logical/planner.go index a55cd9d289dd0..11bb583187f1c 100644 --- a/pkg/engine/internal/planner/logical/planner.go +++ b/pkg/engine/internal/planner/logical/planner.go @@ -188,12 +188,12 @@ func buildPlanForLogQuery( // TODO: there's a subtle bug here, as it is actually possible to have both a logfmt parser and a json parser // for example, the query `{app="foo"} | json | line_format "{{.nested_json}}" | json ` is valid, and will need - // multiple parse stages. We will handle thid in a future PR. + // multiple parse stages. We will handle this in a future PR. 
if hasLogfmtParser { - builder = builder.Parse(ParserLogfmt) + builder = builder.Parse(types.FunctionOpParseLogfmt) } if hasJSONParser { - builder = builder.Parse(ParserJSON) + builder = builder.Parse(types.FunctionOpParseJSON) } for _, value := range postParsePredicates { builder = builder.Select(value) diff --git a/pkg/engine/internal/planner/logical/planner_test.go b/pkg/engine/internal/planner/logical/planner_test.go index 8e54c2aaf190f..6d6baf7dd595e 100644 --- a/pkg/engine/internal/planner/logical/planner_test.go +++ b/pkg/engine/internal/planner/logical/planner_test.go @@ -415,8 +415,8 @@ RETURN %10 }) } -func TestPlannerCreatesParse(t *testing.T) { - t.Run("creates Parse instruction for metric query with logfmt", func(t *testing.T) { +func TestPlannerCreatesProjectionWithParseOperation(t *testing.T) { + t.Run("creates projection instruction with logfmt parse operation for metric query", func(t *testing.T) { // Query with logfmt parser followed by label filter in an instant metric query q := &query{ statement: `sum by (level) (count_over_time({app="test"} | logfmt | level="error" [5m]))`, @@ -437,7 +437,7 @@ func TestPlannerCreatesParse(t *testing.T) { %4 = SELECT %2 [predicate=%3] %5 = LT builtin.timestamp 1970-01-01T02:00:00Z %6 = SELECT %4 [predicate=%5] -%7 = PARSE %6 [kind=logfmt] +%7 = PROJECT %6 [mode=*E, expr=PARSE_LOGFMT(builtin.message)] %8 = EQ ambiguous.level "error" %9 = SELECT %7 [predicate=%8] %10 = RANGE_AGGREGATION %9 [operation=count, start_ts=1970-01-01T01:00:00Z, end_ts=1970-01-01T02:00:00Z, step=0s, range=5m0s] @@ -448,7 +448,7 @@ RETURN %12 require.Equal(t, expected, plan.String()) }) - t.Run("creates Parse instruction for log query with logfmt", func(t *testing.T) { + t.Run("creates projection instruction with logfmt parse operation for log query", func(t *testing.T) { q := &query{ statement: `{app="test"} | logfmt | level="error"`, start: 3600, @@ -468,7 +468,7 @@ RETURN %12 %4 = SELECT %2 [predicate=%3] %5 = LT builtin.timestamp 1970-01-01T02:00:00Z %6 = SELECT %4 [predicate=%5] -%7 = PARSE %6 [kind=logfmt] +%7 = PROJECT %6 [mode=*E, expr=PARSE_LOGFMT(builtin.message)] %8 = EQ ambiguous.level "error" %9 = SELECT %7 [predicate=%8] %10 = SORT %9 [column=builtin.timestamp, asc=false, nulls_first=false] @@ -479,7 +479,7 @@ RETURN %12 require.Equal(t, expected, plan.String()) }) - t.Run("creates Parse instruction for metric query with json", func(t *testing.T) { + t.Run("creates projection instruction with json parse operation for metric query", func(t *testing.T) { // Query with logfmt parser followed by label filter in an instant metric query q := &query{ statement: `sum by (level) (count_over_time({app="test"} | json | level="error" [5m]))`, @@ -499,7 +499,7 @@ RETURN %12 %4 = SELECT %2 [predicate=%3] %5 = LT builtin.timestamp 1970-01-01T02:00:00Z %6 = SELECT %4 [predicate=%5] -%7 = PARSE %6 [kind=json] +%7 = PROJECT %6 [mode=*E, expr=PARSE_JSON(builtin.message)] %8 = EQ ambiguous.level "error" %9 = SELECT %7 [predicate=%8] %10 = RANGE_AGGREGATION %9 [operation=count, start_ts=1970-01-01T01:00:00Z, end_ts=1970-01-01T02:00:00Z, step=0s, range=5m0s] @@ -510,7 +510,7 @@ RETURN %12 require.Equal(t, expected, plan.String()) }) - t.Run("creates Parse instruction for log query with json", func(t *testing.T) { + t.Run("creates projection instruction with json parse operation for log query", func(t *testing.T) { q := &query{ statement: `{app="test"} | json | level="error"`, start: 3600, @@ -529,7 +529,7 @@ RETURN %12 %4 = SELECT %2 [predicate=%3] %5 = LT 
builtin.timestamp 1970-01-01T02:00:00Z %6 = SELECT %4 [predicate=%5] -%7 = PARSE %6 [kind=json] +%7 = PROJECT %6 [mode=*E, expr=PARSE_JSON(builtin.message)] %8 = EQ ambiguous.level "error" %9 = SELECT %7 [predicate=%8] %10 = SORT %9 [column=builtin.timestamp, asc=false, nulls_first=false] @@ -540,7 +540,7 @@ RETURN %12 require.Equal(t, expected, plan.String()) }) - t.Run("preserves operation order with filters before and after parse", func(t *testing.T) { + t.Run("preserves operation order with filters before and after projection with parse operation", func(t *testing.T) { // Test that filters before logfmt parse are applied before parsing, // and filters after logfmt parse are applied after parsing. // This is important for performance - we don't want to parse lines @@ -568,7 +568,7 @@ RETURN %12 %8 = SELECT %6 [predicate=%7] %9 = SELECT %8 [predicate=%2] %10 = SELECT %9 [predicate=%3] -%11 = PARSE %10 [kind=logfmt] +%11 = PROJECT %10 [mode=*E, expr=PARSE_LOGFMT(builtin.message)] %12 = EQ ambiguous.level "debug" %13 = SELECT %11 [predicate=%12] %14 = SORT %13 [column=builtin.timestamp, asc=false, nulls_first=false] @@ -580,7 +580,7 @@ RETURN %16 require.Equal(t, expected, plan.String(), "Operations should be in the correct order: LineFilter before Parse, LabelFilter after Parse") }) - t.Run("preserves operation order in metric query with filters before and after parse", func(t *testing.T) { + t.Run("preserves operation order in metric query with filters before and after projection with parse operation", func(t *testing.T) { // Test that filters before logfmt parse are applied before parsing in metric queries too q := &query{ statement: `sum by (level) (count_over_time({job="app"} |= "error" | label="value" | logfmt | level="debug" [5m]))`, @@ -605,7 +605,7 @@ RETURN %16 %8 = SELECT %6 [predicate=%7] %9 = SELECT %8 [predicate=%2] %10 = SELECT %9 [predicate=%3] -%11 = PARSE %10 [kind=logfmt] +%11 = PROJECT %10 [mode=*E, expr=PARSE_LOGFMT(builtin.message)] %12 = EQ ambiguous.level "debug" %13 = SELECT %11 [predicate=%12] %14 = RANGE_AGGREGATION %13 [operation=count, start_ts=1970-01-01T01:00:00Z, end_ts=1970-01-01T02:00:00Z, step=0s, range=5m0s] diff --git a/pkg/engine/internal/planner/physical/expressions.go b/pkg/engine/internal/planner/physical/expressions.go index 2a83984965ea4..dff385f00dc58 100644 --- a/pkg/engine/internal/planner/physical/expressions.go +++ b/pkg/engine/internal/planner/physical/expressions.go @@ -2,6 +2,7 @@ package physical import ( "fmt" + "strings" "github.com/grafana/loki/v3/pkg/engine/internal/types" ) @@ -14,6 +15,7 @@ const ( ExprTypeUnary ExprTypeBinary + ExprTypeFunction ExprTypeLiteral ExprTypeColumn ) @@ -25,6 +27,8 @@ func (t ExpressionType) String() string { return "UnaryExpression" case ExprTypeBinary: return "BinaryExpression" + case ExprTypeFunction: + return "FunctionExpression" case ExprTypeLiteral: return "LiteralExpression" case ExprTypeColumn: @@ -64,6 +68,13 @@ type BinaryExpression interface { isBinaryExpr() } +// FunctionExpression is the common interface for all function expressions in a +// physical plan. +type FunctionExpression interface { + Expression + isFunctionExpr() +} + // LiteralExpression is the common interface for all literal expressions in a // physical plan. type LiteralExpression interface { @@ -102,7 +113,7 @@ func (e *UnaryExpr) String() string { return fmt.Sprintf("%s(%s)", e.Op, e.Left) } -// ID returns the type of the [UnaryExpr]. +// Type returns the type of the [UnaryExpr]. 
func (*UnaryExpr) Type() ExpressionType {
 	return ExprTypeUnary
 }
@@ -129,7 +140,7 @@ func (e *BinaryExpr) String() string {
 	return fmt.Sprintf("%s(%s, %s)", e.Op, e.Left, e.Right)
 }
 
-// ID returns the type of the [BinaryExpr].
+// Type returns the type of the [BinaryExpr].
 func (*BinaryExpr) Type() ExpressionType {
 	return ExprTypeBinary
 }
@@ -153,7 +164,7 @@ func (e *LiteralExpr) String() string {
 	return e.Literal.String()
 }
 
-// ID returns the type of the [LiteralExpr].
+// Type returns the type of the [LiteralExpr].
 func (*LiteralExpr) Type() ExpressionType {
 	return ExprTypeLiteral
 }
@@ -198,7 +209,74 @@ func (e *ColumnExpr) String() string {
 	return e.Ref.String()
 }
 
-// ID returns the type of the [ColumnExpr].
+// Type returns the type of the [ColumnExpr].
 func (e *ColumnExpr) Type() ExpressionType {
 	return ExprTypeColumn
 }
+
+// FunctionExpr is an expression that implements the [FunctionExpression] interface.
+type FunctionExpr struct {
+	// Op is the function operation to apply to the parameters
+	Op types.FunctionOp
+
+	// Expressions are the parameters passed to the function
+	Expressions []Expression
+}
+
+func (*FunctionExpr) isExpr()         {}
+func (*FunctionExpr) isFunctionExpr() {}
+
+// Clone returns a copy of the [FunctionExpr].
+func (e *FunctionExpr) Clone() Expression {
+	params := make([]Expression, len(e.Expressions))
+	for i, param := range e.Expressions {
+		params[i] = param.Clone()
+	}
+
+	return &FunctionExpr{
+		Expressions: params,
+		Op:          e.Op,
+	}
+}
+
+func (e *FunctionExpr) String() string {
+	exprs := make([]string, len(e.Expressions))
+	for i, expr := range e.Expressions {
+		exprs[i] = expr.String()
+	}
+	return fmt.Sprintf("%s(%s)", e.Op, strings.Join(exprs, ", "))
+}
+
+// Type returns the type of the [FunctionExpr].
+func (*FunctionExpr) Type() ExpressionType {
+	return ExprTypeFunction
+}
+
+// NamedLiteralExpr is a literal expression with an associated parameter name,
+// used to pass named arguments to a [FunctionExpr].
+type NamedLiteralExpr struct {
+	types.Literal
+	Name string
+}
+
+func (*NamedLiteralExpr) isExpr()        {}
+func (*NamedLiteralExpr) isLiteralExpr() {}
+
+// Clone returns a copy of the [NamedLiteralExpr].
+func (e *NamedLiteralExpr) Clone() Expression {
+	// No need to clone literals.
+	return &NamedLiteralExpr{Literal: e.Literal, Name: e.Name}
+}
+
+// String returns the string representation of the literal value.
+func (e *NamedLiteralExpr) String() string {
+	return fmt.Sprintf("%s: %s", e.Name, e.Literal.String())
+}
+
+// Type returns the type of the [NamedLiteralExpr].
+func (*NamedLiteralExpr) Type() ExpressionType {
+	return ExprTypeLiteral
+}
+
+// ValueType returns the kind of value represented by the literal.
+func (e *NamedLiteralExpr) ValueType() types.DataType {
+	return e.Literal.Type()
+}
diff --git a/pkg/engine/internal/planner/physical/optimizer.go b/pkg/engine/internal/planner/physical/optimizer.go
index bf71a6288ddc3..6bcf33802940d 100644
--- a/pkg/engine/internal/planner/physical/optimizer.go
+++ b/pkg/engine/internal/planner/physical/optimizer.go
@@ -268,13 +268,13 @@ func (r *projectionPushdown) propagateProjections(node Node, projections []Colum
 		extracted := extractColumnsFromPredicates(node.Predicates)
 		projections = append(projections, extracted...)
 
-	case *ParseNode:
-		// ParseNode is a special case. It is both a target for projections and a source of projections.
-		// [Target] Ambiguous columns are applied as requested keys to ParseNode.
-		// [Source] Appends builtin message column.
-		var parseNodeChanged bool
-		parseNodeChanged, projections = r.handleParseNode(node, projections)
-		if parseNodeChanged {
+	case *Projection:
+		// A Projection is a special case: it is both a target for and a source of projections.
+		// [Target] Operations may take columns as arguments, such as requested keys for parse.
+		// [Source] Operations may contain columns to append, such as the builtin message column for parse.
+		var projectionNodeChanged bool
+		projectionNodeChanged, projections = r.handleProjection(node, projections)
+		if projectionNodeChanged {
 			changed = true
 		}
 
@@ -371,13 +371,52 @@ func (r *projectionPushdown) handleDataobjScan(node *DataObjScan, projections []
 	return changed
 }
 
-// handleParseNode handles projection pushdown for ParseNode nodes
-func (r *projectionPushdown) handleParseNode(node *ParseNode, projections []ColumnExpression) (bool, []ColumnExpression) {
+// handleProjection handles projection pushdown for expressions on Projection nodes
+func (r *projectionPushdown) handleProjection(node *Projection, projections []ColumnExpression) (bool, []ColumnExpression) {
+	anyChanged := false
+	for _, e := range node.Expressions {
+		switch e := e.(type) {
+		case *FunctionExpr:
+			if e.Op == types.FunctionOpParseJSON || e.Op == types.FunctionOpParseLogfmt {
+				return r.handleParse(e, projections)
+			}
+		}
+	}
+
+	return anyChanged, projections
+}
+
+func (r *projectionPushdown) handleParse(expr *FunctionExpr, projections []ColumnExpression) (bool, []ColumnExpression) {
 	_, ambiguousProjections := disambiguateColumns(projections)
 
+	var requestedKeysExpr *NamedLiteralExpr
+	for _, e := range expr.Expressions {
+		switch e := e.(type) {
+		case *NamedLiteralExpr:
+			if e.Name != types.ParseRequestedKeys {
+				continue
+			}
+
+			requestedKeysExpr = e
+		}
+	}
+
+	if requestedKeysExpr == nil {
+		requestedKeysExpr = &NamedLiteralExpr{
+			Name:    types.ParseRequestedKeys,
+			Literal: types.NewLiteral([]string{}),
+		}
+		expr.Expressions = append(expr.Expressions, requestedKeysExpr)
+	}
+
+	existingKeys, ok := requestedKeysExpr.Literal.(types.StringListLiteral)
+	if !ok {
+		return false, projections
+	}
+
 	// Found a parse function - update its requested keys
 	requestedKeys := make(map[string]bool)
-	for _, k := range node.RequestedKeys {
+	for _, k := range existingKeys {
 		requestedKeys[k] = true
 	}
@@ -393,12 +432,12 @@ func (r *projectionPushdown) handleParseNode(node *ParseNode, projections []Colu
 		}
 	}
 
-	changed := len(requestedKeys) > len(node.RequestedKeys)
+	changed := len(requestedKeys) > len(existingKeys)
 	if changed {
 		// Convert back to sorted slice
 		newKeys := slices.Collect(maps.Keys(requestedKeys))
 		sort.Strings(newKeys)
-		node.RequestedKeys = newKeys
+		requestedKeysExpr.Literal = types.NewLiteral(newKeys)
 	}
 
 	projections = append(projections, &ColumnExpr{
@@ -491,7 +530,7 @@ func (p *parallelPushdown) applyParallelization(node Node) bool {
 	// There can be additional special cases, such as parallelizing an `avg` by
 	// pushing down a `sum` and `count` into the Parallelize.
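+	// As a sketch of the shift performed below: `Filter -> Parallelize -> DataObjScan`
+	// becomes `Parallelize -> Filter -> DataObjScan`; Projection and ColumnCompat
+	// nodes are shifted through the Parallelize in the same way.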
switch node.(type) { - case *Projection, *Filter, *ParseNode, *ColumnCompat: // Catchall for shifting nodes + case *Projection, *Filter, *ColumnCompat: // Catchall for shifting nodes for _, parallelize := range p.plan.Children(node) { p.plan.graph.Inject(parallelize, node.Clone()) } diff --git a/pkg/engine/internal/planner/physical/optimizer_test.go b/pkg/engine/internal/planner/physical/optimizer_test.go index 9db0455446875..8a518be1c8b36 100644 --- a/pkg/engine/internal/planner/physical/optimizer_test.go +++ b/pkg/engine/internal/planner/physical/optimizer_test.go @@ -599,7 +599,7 @@ func TestProjectionPushdown(t *testing.T) { }) } -func TestProjectionPushdown_PushesRequestedKeysToParseNodes(t *testing.T) { +func TestProjectionPushdown_PushesRequestedKeysToParseOperations(t *testing.T) { tests := []struct { name string buildLogical func() logical.Value @@ -607,7 +607,7 @@ func TestProjectionPushdown_PushesRequestedKeysToParseNodes(t *testing.T) { expectedDataObjScanProjections []string }{ { - name: "ParseNode remains empty when no operations need parsed fields", + name: "requested keys remain empty when no operations need parsed fields", buildLogical: func() logical.Value { // Create a simple log query with no filters that need parsed fields // {app="test"} | logfmt @@ -623,41 +623,12 @@ func TestProjectionPushdown_PushesRequestedKeysToParseNodes(t *testing.T) { }) // Add parse but no filters requiring parsed fields - builder = builder.Parse(logical.ParserLogfmt) + builder = builder.Parse(types.FunctionOpParseLogfmt) return builder.Value() }, }, { - name: "ParseNode extracts all keys for log queries", - buildLogical: func() logical.Value { - // Create a logical plan that represents: - // {app="test"} | logfmt | level="error" - // This is a log query (no RangeAggregation) so should parse all keys - builder := logical.NewBuilder(&logical.MakeTable{ - Selector: &logical.BinOp{ - Left: logical.NewColumnRef("app", types.ColumnTypeLabel), - Right: logical.NewLiteral("test"), - Op: types.BinaryOpEq, - }, - Shard: logical.NewShard(0, 1), // noShard - }) - - // Don't set RequestedKeys here - optimization should determine them - builder = builder.Parse(logical.ParserLogfmt) - - // Add filter with ambiguous column - filterExpr := &logical.BinOp{ - Left: logical.NewColumnRef("level", types.ColumnTypeAmbiguous), - Right: logical.NewLiteral("error"), - Op: types.BinaryOpEq, - } - builder = builder.Select(filterExpr) - return builder.Value() - }, - expectedParseKeysRequested: nil, // Log queries should parse all keys - }, - { - name: "ParseNode skips label and builtin columns, only collects ambiguous", + name: "skips label and builtin columns, only collects ambiguous", buildLogical: func() logical.Value { // {app="test"} | logfmt | app="frontend" | level="error" // This is a log query (no RangeAggregation) so should parse all keys @@ -670,7 +641,7 @@ func TestProjectionPushdown_PushesRequestedKeysToParseNodes(t *testing.T) { Shard: logical.NewShard(0, 1), }) - builder = builder.Parse(logical.ParserLogfmt) + builder = builder.Parse(types.FunctionOpParseLogfmt) // Add filter on label column (should be skipped) labelFilter := &logical.BinOp{ @@ -702,7 +673,36 @@ func TestProjectionPushdown_PushesRequestedKeysToParseNodes(t *testing.T) { expectedDataObjScanProjections: []string{"app", "level", "message", "timestamp"}, }, { - name: "ParseNode collects RangeAggregation PartitionBy ambiguous columns", + name: "parse operation extracts all keys for log queries", + buildLogical: func() logical.Value { + // 
Create a logical plan that represents:
+			// {app="test"} | logfmt | level="error"
+			// This is a log query (no RangeAggregation) so should parse all keys
+			builder := logical.NewBuilder(&logical.MakeTable{
+				Selector: &logical.BinOp{
+					Left:  logical.NewColumnRef("app", types.ColumnTypeLabel),
+					Right: logical.NewLiteral("test"),
+					Op:    types.BinaryOpEq,
+				},
+				Shard: logical.NewShard(0, 1), // noShard
+			})
+
+			// Don't set RequestedKeys here - optimization should determine them
+			builder = builder.Parse(types.FunctionOpParseLogfmt)
+
+			// Add filter with ambiguous column
+			filterExpr := &logical.BinOp{
+				Left:  logical.NewColumnRef("level", types.ColumnTypeAmbiguous),
+				Right: logical.NewLiteral("error"),
+				Op:    types.BinaryOpEq,
+			}
+			builder = builder.Select(filterExpr)
+			return builder.Value()
+		},
+		expectedParseKeysRequested: nil, // Log queries should parse all keys
+	},
+	{
+		name: "RangeAggregation with PartitionBy on ambiguous columns",
 		buildLogical: func() logical.Value {
 			// count_over_time({app="test"} | logfmt [5m]) by (duration, service)
 			builder := logical.NewBuilder(&logical.MakeTable{
@@ -714,7 +714,7 @@ func TestProjectionPushdown_PushesRequestedKeysToParseOperations(t *testing.T) {
 				Shard: logical.NewShard(0, 1),
 			})
 
-			builder = builder.Parse(logical.ParserLogfmt)
+			builder = builder.Parse(types.FunctionOpParseLogfmt)
 
 			// Range aggregation with PartitionBy
 			builder = builder.RangeAggregation(
@@ -735,7 +735,7 @@ func TestProjectionPushdown_PushesRequestedKeysToParseOperations(t *testing.T) {
 			expectedDataObjScanProjections: []string{"duration", "message", "service", "timestamp"},
 		},
 		{
-			name: "ParseNode collects ambiguous columns from RangeAggregation and Filter",
+			name: "parse operation collects ambiguous columns from RangeAggregation and Filter",
 			buildLogical: func() logical.Value {
 				// Create a logical plan that represents:
 				// sum by(status,code) (count_over_time({app="test"} | logfmt | duration > 100 [5m]))
 				builder := logical.NewBuilder(&logical.MakeTable{
@@ -749,7 +749,7 @@ func TestProjectionPushdown_PushesRequestedKeysToParseOperations(t *testing.T) {
 				})
 
 				// Don't set RequestedKeys here - optimization should determine them
-				builder = builder.Parse(logical.ParserLogfmt)
+				builder = builder.Parse(types.FunctionOpParseLogfmt)
 
 				// Add filter with ambiguous column
 				filterExpr := &logical.BinOp{
@@ -782,6 +782,90 @@ func TestProjectionPushdown_PushesRequestedKeysToParseOperations(t *testing.T) {
 			expectedParseKeysRequested:     []string{"code", "duration", "status"}, // sorted alphabetically
 			expectedDataObjScanProjections: []string{"code", "duration", "message", "status", "timestamp"},
 		},
+		{
+			name: "log query should request all keys even with filters",
+			buildLogical: func() logical.Value {
+				// Create a logical plan that represents a log query:
+				// {app="test"} | logfmt | level="error" | limit 100
+				// This is a log query (no range aggregation) so should parse all keys
+				builder := logical.NewBuilder(&logical.MakeTable{
+					Selector: &logical.BinOp{
+						Left:  logical.NewColumnRef("app", types.ColumnTypeLabel),
+						Right: logical.NewLiteral("test"),
+						Op:    types.BinaryOpEq,
+					},
+					Shard: logical.NewShard(0, 1),
+				})
+
+				// Add parse without specifying RequestedKeys
+				builder = builder.Parse(types.FunctionOpParseLogfmt)
+
+				// Add filter on ambiguous column
+				filterExpr := &logical.BinOp{
+					Left:  logical.NewColumnRef("level", types.ColumnTypeAmbiguous),
+					Right: logical.NewLiteral("error"),
+					Op:    types.BinaryOpEq,
+				}
+				builder = builder.Select(filterExpr)
+
+				// Add a limit (typical for log queries)
+				builder = builder.Limit(0, 100)
+
+				return builder.Value()
+			},
+		},
+		{
+			name: "ambiguous projections are consumed, they are not pushed down to DataObjScans",
+			buildLogical: func() logical.Value {
+				// Create a logical plan that represents:
+				// sum by(app) (count_over_time({app="test"} | logfmt | level="error" [5m]) by (status, app))
+				selectorPredicate := &logical.BinOp{
+					Left:  logical.NewColumnRef("app", types.ColumnTypeLabel),
+					Right: logical.NewLiteral("test"),
+					Op:    types.BinaryOpEq,
+				}
+				builder := logical.NewBuilder(&logical.MakeTable{
+					Selector:   selectorPredicate,
+					Predicates: []logical.Value{selectorPredicate},
+					Shard:      logical.NewShard(0, 1), // noShard
+				})
+
+				// Don't set RequestedKeys here - optimization should determine them
+				builder = builder.Parse(types.FunctionOpParseLogfmt)
+
+				// Add filter with ambiguous column (different from grouping field)
+				filterExpr := &logical.BinOp{
+					Left:  logical.NewColumnRef("level", types.ColumnTypeAmbiguous),
+					Right: logical.NewLiteral("error"),
+					Op:    types.BinaryOpEq,
+				}
+				builder = builder.Select(filterExpr)
+
+				// Range aggregation
+				builder = builder.RangeAggregation(
+					[]logical.ColumnRef{
+						{Ref: types.ColumnRef{Column: "status", Type: types.ColumnTypeAmbiguous}},
+						{Ref: types.ColumnRef{Column: "app", Type: types.ColumnTypeLabel}},
+					}, // partition by (status, app)
+					types.RangeAggregationTypeCount,
+					time.Unix(0, 0),
+					time.Unix(3600, 0),
+					5*time.Minute, // step
+					5*time.Minute, // range interval
+				)
+
+				// Vector aggregation with a single group-by on the app label (different from the filter field)
+				builder = builder.VectorAggregation(
+					[]logical.ColumnRef{
+						{Ref: types.ColumnRef{Column: "app", Type: types.ColumnTypeLabel}},
+					},
+					types.VectorAggregationTypeSum,
+				)
+				return builder.Value()
+			},
+			expectedParseKeysRequested:     []string{"level", "status"},
+			expectedDataObjScanProjections: []string{"app", "message", "timestamp"},
+		},
 	}
 
 	for _, tt := range tests {
@@ -813,12 +897,11 @@
 			optimizedPlan, err := planner.Optimize(physicalPlan)
 			require.NoError(t, err)
 
-			// Check that ParseNode and DataObjScan get the correct projections
-			var parseNode *ParseNode
+			var projectionNode *Projection
 			projections := map[string]struct{}{}
 			for node := range optimizedPlan.graph.Nodes() {
-				if pn, ok := node.(*ParseNode); ok {
-					parseNode = pn
+				if pn, ok := node.(*Projection); ok {
+					projectionNode = pn
 					continue
 				}
 				if pn, ok := node.(*ScanSet); ok {
@@ -829,14 +912,36 @@
 				}
 			}
 
+			require.NotNil(t, projectionNode, "Projection not found in plan")
+			var requestedKeys *NamedLiteralExpr
+			for _, expr := range projectionNode.Expressions {
+				switch expr := expr.(type) {
+				case *FunctionExpr:
+					for _, e := range expr.Expressions {
+						switch e := e.(type) {
+						case *NamedLiteralExpr:
+							if e.Name == types.ParseRequestedKeys {
+								requestedKeys = e
+								break
+							}
+						}
+					}
+				}
+			}
+
+			if len(tt.expectedParseKeysRequested) == 0 {
+				require.Nil(t, requestedKeys, "Projection should have no requested keys")
+			} else {
+				require.NotNil(t, requestedKeys, "Projection should have requested keys")
+				actual := requestedKeys.Literal.(types.StringListLiteral)
+				require.Equal(t, tt.expectedParseKeysRequested, actual.Value())
+			}
+
 			var projectionArr []string
 			for column := range projections {
 				projectionArr = append(projectionArr, column)
 			}
 			sort.Strings(projectionArr)
-
-			require.NotNil(t, parseNode, "ParseNode not found in plan")
-			require.Equal(t, tt.expectedParseKeysRequested, parseNode.RequestedKeys)
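+			// The parse projection consumes ambiguous parsed keys, so only
+			// concrete columns are expected to reach the DataObjScan targets.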
require.Equal(t, tt.expectedDataObjScanProjections, projectionArr) }) } @@ -976,45 +1081,6 @@ func Test_parallelPushdown(t *testing.T) { require.Equal(t, expected, PrintAsTree(&plan)) }) - t.Run("Shifts Parse", func(t *testing.T) { - var plan Plan - { - vectorAgg := plan.graph.Add(&VectorAggregation{}) - rangeAgg := plan.graph.Add(&RangeAggregation{}) - parse := plan.graph.Add(&ParseNode{}) - parallelize := plan.graph.Add(&Parallelize{}) - scan := plan.graph.Add(&DataObjScan{}) - - require.NoError(t, plan.graph.AddEdge(dag.Edge[Node]{Parent: vectorAgg, Child: rangeAgg})) - require.NoError(t, plan.graph.AddEdge(dag.Edge[Node]{Parent: rangeAgg, Child: parse})) - require.NoError(t, plan.graph.AddEdge(dag.Edge[Node]{Parent: parse, Child: parallelize})) - require.NoError(t, plan.graph.AddEdge(dag.Edge[Node]{Parent: parallelize, Child: scan})) - } - - opt := newOptimizer(&plan, []*optimization{ - newOptimization("ParallelPushdown", &plan).withRules(¶llelPushdown{plan: &plan}), - }) - root, _ := plan.graph.Root() - opt.optimize(root) - - var expectedPlan Plan - { - vectorAgg := expectedPlan.graph.Add(&VectorAggregation{}) - rangeAgg := expectedPlan.graph.Add(&RangeAggregation{}) - parallelize := expectedPlan.graph.Add(&Parallelize{}) - parse := expectedPlan.graph.Add(&ParseNode{}) - scan := expectedPlan.graph.Add(&DataObjScan{}) - - require.NoError(t, expectedPlan.graph.AddEdge(dag.Edge[Node]{Parent: vectorAgg, Child: rangeAgg})) - require.NoError(t, expectedPlan.graph.AddEdge(dag.Edge[Node]{Parent: rangeAgg, Child: parallelize})) - require.NoError(t, expectedPlan.graph.AddEdge(dag.Edge[Node]{Parent: parallelize, Child: parse})) - require.NoError(t, expectedPlan.graph.AddEdge(dag.Edge[Node]{Parent: parse, Child: scan})) - } - - expected := PrintAsTree(&expectedPlan) - require.Equal(t, expected, PrintAsTree(&plan)) - }) - t.Run("Splits TopK", func(t *testing.T) { var plan Plan { diff --git a/pkg/engine/internal/planner/physical/parse.go b/pkg/engine/internal/planner/physical/parse.go deleted file mode 100644 index ad02f482218a7..0000000000000 --- a/pkg/engine/internal/planner/physical/parse.go +++ /dev/null @@ -1,55 +0,0 @@ -package physical - -import ( - "fmt" - "slices" -) - -// ParseNode represents a parsing operation in the physical plan. -// It extracts structured fields from log lines using the specified parser. -type ParseNode struct { - id string - Kind ParserKind - RequestedKeys []string -} - -// ParserKind represents the type of parser to use -type ParserKind int - -const ( - ParserInvalid ParserKind = iota - ParserLogfmt - ParserJSON -) - -func (p ParserKind) String() string { - switch p { - case ParserLogfmt: - return "logfmt" - case ParserJSON: - return "json" - default: - return "invalid" - } -} - -// ID returns a unique identifier for this ParseNode -func (n *ParseNode) ID() string { - if n.id != "" { - return n.id - } - return fmt.Sprintf("%p", n) -} - -// Clone returns a deep copy of the node (minus its ID). 
-func (n *ParseNode) Clone() Node { - return &ParseNode{ - Kind: n.Kind, - RequestedKeys: slices.Clone(n.RequestedKeys), - } -} - -// Type returns the node type -func (n *ParseNode) Type() NodeType { - return NodeTypeParse -} diff --git a/pkg/engine/internal/planner/physical/plan.go b/pkg/engine/internal/planner/physical/plan.go index 085ba69a472f7..4d171fc988c57 100644 --- a/pkg/engine/internal/planner/physical/plan.go +++ b/pkg/engine/internal/planner/physical/plan.go @@ -98,7 +98,6 @@ var _ Node = (*Limit)(nil) var _ Node = (*Filter)(nil) var _ Node = (*RangeAggregation)(nil) var _ Node = (*VectorAggregation)(nil) -var _ Node = (*ParseNode)(nil) var _ Node = (*ColumnCompat)(nil) var _ Node = (*TopK)(nil) var _ Node = (*Parallelize)(nil) @@ -111,7 +110,6 @@ func (*Limit) isNode() {} func (*Filter) isNode() {} func (*RangeAggregation) isNode() {} func (*VectorAggregation) isNode() {} -func (*ParseNode) isNode() {} func (*ColumnCompat) isNode() {} func (*TopK) isNode() {} func (*Parallelize) isNode() {} diff --git a/pkg/engine/internal/planner/physical/planner.go b/pkg/engine/internal/planner/physical/planner.go index 077943489f472..5982d872a8afb 100644 --- a/pkg/engine/internal/planner/physical/planner.go +++ b/pkg/engine/internal/planner/physical/planner.go @@ -128,6 +128,15 @@ func (p *Planner) convertPredicate(inst logical.Value) Expression { return &ColumnExpr{Ref: inst.Ref} case *logical.Literal: return NewLiteral(inst.Value()) + case *logical.FunctionOp: + exprs := make([]Expression, len(inst.Values)) + for i, v := range inst.Values { + exprs[i] = p.convertPredicate(v) + } + return &FunctionExpr{ + Op: inst.Op, + Expressions: exprs, + } default: panic(fmt.Sprintf("invalid value for predicate: %T", inst)) } @@ -150,8 +159,6 @@ func (p *Planner) process(inst logical.Value, ctx *Context) (Node, error) { return p.processRangeAggregation(inst, ctx) case *logical.VectorAggregation: return p.processVectorAggregation(inst, ctx) - case *logical.Parse: - return p.processParse(inst, ctx) case *logical.BinOp: return p.processBinOp(inst, ctx) case *logical.UnaryOp: @@ -542,38 +549,6 @@ func (p *Planner) processUnaryOp(lp *logical.UnaryOp, ctx *Context) (Node, error return node, nil } -// Convert [logical.Parse] into one [ParseNode] node. -// A ParseNode initially has an empty list of RequestedKeys which will be populated during optimization. 
-func (p *Planner) processParse(lp *logical.Parse, ctx *Context) (Node, error) {
-	var node Node = &ParseNode{
-		Kind: convertParserKind(lp.Kind),
-	}
-	p.plan.graph.Add(node)
-
-	child, err := p.process(lp.Table, ctx)
-	if err != nil {
-		return nil, err
-	}
-
-	if err := p.plan.graph.AddEdge(dag.Edge[Node]{Parent: node, Child: child}); err != nil {
-		return nil, err
-	}
-
-	if p.context.v1Compatible {
-		compat := &ColumnCompat{
-			Source:      types.ColumnTypeParsed,
-			Destination: types.ColumnTypeParsed,
-			Collision:   types.ColumnTypeLabel,
-		}
-		node, err = p.wrapNodeWith(node, compat)
-		if err != nil {
-			return nil, err
-		}
-	}
-
-	return node, nil
-}
-
 func (p *Planner) wrapNodeWith(node Node, wrapper Node) (Node, error) {
 	p.plan.graph.Add(wrapper)
 	if err := p.plan.graph.AddEdge(dag.Edge[Node]{Parent: wrapper, Child: node}); err != nil {
@@ -616,14 +591,3 @@ func (p *Planner) Optimize(plan *Plan) (*Plan, error) {
 	}
 	return plan, nil
 }
-
-func convertParserKind(kind logical.ParserKind) ParserKind {
-	switch kind {
-	case logical.ParserLogfmt:
-		return ParserLogfmt
-	case logical.ParserJSON:
-		return ParserJSON
-	default:
-		return ParserInvalid
-	}
-}
diff --git a/pkg/engine/internal/planner/physical/planner_test.go b/pkg/engine/internal/planner/physical/planner_test.go
index a3779f4bca67c..072bbf25129d9 100644
--- a/pkg/engine/internal/planner/physical/planner_test.go
+++ b/pkg/engine/internal/planner/physical/planner_test.go
@@ -270,7 +270,7 @@ func TestPlanner_Convert_WithParse(t *testing.T) {
 				Shard: logical.NewShard(0, 1),
 			},
 		).Parse(
-			logical.ParserLogfmt,
+			types.FunctionOpParseLogfmt,
 		).Select(
 			&logical.BinOp{
 				Left:  logical.NewColumnRef("level", types.ColumnTypeAmbiguous),
@@ -290,7 +290,7 @@
 		planner := NewPlanner(NewContext(time.Now(), time.Now()), catalog)
 		physicalPlan, err := planner.Build(logicalPlan)
-		t.Logf("Physical plan\n%s\n", PrintAsTree(physicalPlan))
+		t.Logf("\nPhysical plan\n%s\n", PrintAsTree(physicalPlan))
 		require.NoError(t, err)
 
 		// Verify the parse projection exists in the correct position
@@ -304,24 +304,33 @@
 		children := physicalPlan.Children(filterNode)
 		require.Len(t, children, 1)
 
-		compatNode, ok := children[0].(*ColumnCompat)
-		require.True(t, ok, "Filter's child should be ColumnCompat")
-		require.Equal(t, types.ColumnTypeParsed, compatNode.Source)
+		projectionNode, ok := children[0].(*Projection)
+		require.True(t, ok, "Filter's child should be Projection")
+		require.Len(t, projectionNode.Expressions, 1)
 
-		children = physicalPlan.Children(compatNode)
-		require.Len(t, children, 1)
+		expr, ok := projectionNode.Expressions[0].(*FunctionExpr)
+		require.True(t, ok)
+		require.Equal(t, types.FunctionOpParseLogfmt, expr.Op)
+
+		funcArgs := expr.Expressions
+		require.Len(t, funcArgs, 1)
 
-		parseNode, ok := children[0].(*ParseNode)
-		require.True(t, ok, "ColumnCompat's child should be ParseNode")
-		require.Equal(t, ParserLogfmt, parseNode.Kind)
-		require.Empty(t, parseNode.RequestedKeys)
+		sourceCol, ok := funcArgs[0].(*ColumnExpr)
+		require.True(t, ok)
+		require.Equal(t, types.ColumnNameBuiltinMessage, sourceCol.Ref.Column)
+		require.Equal(t, types.ColumnTypeBuiltin, sourceCol.Ref.Type)
 
 		physicalPlan, err = planner.Optimize(physicalPlan)
-		t.Logf("Optimized plan\n%s\n", PrintAsTree(physicalPlan))
+		t.Logf("\nOptimized plan\n%s\n", PrintAsTree(physicalPlan))
 		require.NoError(t, err)
 
-		// For log queries, parse nodes should request all keys (nil)
-		require.Nil(t, parseNode.RequestedKeys)
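+		// For log queries the optimizer requests all keys, so the parse
+		// expression is expected to keep only its source-column argument
+		// (no requestedKeys literal is appended).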
+		funcArgs = expr.Expressions
+		require.Len(t, funcArgs, 1)
+
+		sourceCol, ok = funcArgs[0].(*ColumnExpr)
+		require.True(t, ok)
+		require.Equal(t, types.ColumnNameBuiltinMessage, sourceCol.Ref.Column)
+		require.Equal(t, types.ColumnTypeBuiltin, sourceCol.Ref.Type)
 	})
 
 	t.Run("Build a query plan for a metric query with Parse", func(t *testing.T) {
@@ -340,7 +349,7 @@
 				Shard: logical.NewShard(0, 1),
 			},
 		).Parse(
-			logical.ParserLogfmt,
+			types.FunctionOpParseLogfmt,
 		).Select(
 			&logical.BinOp{
 				Left:  logical.NewColumnRef("level", types.ColumnTypeAmbiguous),
@@ -367,32 +376,61 @@
 		planner := NewPlanner(NewContext(start, end), catalog)
 		physicalPlan, err := planner.Build(logicalPlan)
-		t.Logf("Physical plan\n%s\n", PrintAsTree(physicalPlan))
+		t.Logf("\nPhysical plan\n%s\n", PrintAsTree(physicalPlan))
 		require.NoError(t, err)
 
-		// Find ParseNode in the plan
-		var parseNode *ParseNode
+		// Verify the parse projection exists in the correct position
 		root, err := physicalPlan.Root()
 		require.NoError(t, err)
-		_ = physicalPlan.DFSWalk(root, func(n Node) error {
-			switch node := n.(type) {
-			case *ParseNode:
-				parseNode = node
-			}
-			return nil
-		}, dag.PreOrderWalk)
-		require.NotNil(t, parseNode, "ParseNode should exist in the plan")
+		// Physical plan is built bottom up, so it should be RangeAggregation -> Filter -> Projection -> ...
+		rangeAgg, ok := root.(*RangeAggregation)
+		require.True(t, ok, "Root should be RangeAggregation")
+
+		children := physicalPlan.Children(rangeAgg)
+		require.Len(t, children, 1)
+
+		filterNode, ok := children[0].(*Filter)
+		require.True(t, ok, "RangeAggregation's child should be Filter")
+
+		children = physicalPlan.Children(filterNode)
+		require.Len(t, children, 1)
+
+		projectionNode, ok := children[0].(*Projection)
+		require.True(t, ok, "Filter's child should be Projection")
+		require.Len(t, projectionNode.Expressions, 1)
+
+		expr, ok := projectionNode.Expressions[0].(*FunctionExpr)
+		require.True(t, ok)
+		require.Equal(t, types.FunctionOpParseLogfmt, expr.Op)
 
-		require.Equal(t, ParserLogfmt, parseNode.Kind)
-		require.Empty(t, parseNode.RequestedKeys) // Before optimization
+		funcArgs := expr.Expressions
+		require.Len(t, funcArgs, 1)
+
+		sourceCol, ok := funcArgs[0].(*ColumnExpr)
+		require.True(t, ok)
+		require.Equal(t, types.ColumnNameBuiltinMessage, sourceCol.Ref.Column)
+		require.Equal(t, types.ColumnTypeBuiltin, sourceCol.Ref.Type)
 
 		physicalPlan, err = planner.Optimize(physicalPlan)
-		t.Logf("Optimized plan\n%s\n", PrintAsTree(physicalPlan))
+		t.Logf("\nOptimized plan\n%s\n", PrintAsTree(physicalPlan))
 		require.NoError(t, err)
 
-		// For metric queries, parse nodes should request specific keys used in aggregations
-		require.Equal(t, []string{"level"}, parseNode.RequestedKeys)
+		funcArgs = expr.Expressions
+		require.Len(t, funcArgs, 2)
+
+		sourceCol, ok = funcArgs[0].(*ColumnExpr)
+		require.True(t, ok)
+		require.Equal(t, types.ColumnNameBuiltinMessage, sourceCol.Ref.Column)
+		require.Equal(t, types.ColumnTypeBuiltin, sourceCol.Ref.Type)
+
+		reqKeys, ok := funcArgs[1].(*NamedLiteralExpr)
+		require.True(t, ok)
+		require.Equal(t, types.ParseRequestedKeys, reqKeys.Name)
+
+		keys, ok := reqKeys.Literal.(types.StringListLiteral)
+		require.True(t, ok)
+		require.Equal(t, []string{"level"}, keys.Value())
 	})
 }
diff --git a/pkg/engine/internal/planner/physical/printer.go b/pkg/engine/internal/planner/physical/printer.go
index 26dadf0b3d2fa..a45d1ef597edf 100644
--- a/pkg/engine/internal/planner/physical/printer.go
+++ b/pkg/engine/internal/planner/physical/printer.go @@ -80,13 +80,6 @@ func toTreeNode(n Node) *tree.Node { if len(node.GroupBy) > 0 { treeNode.Properties = append(treeNode.Properties, tree.NewProperty("group_by", true, toAnySlice(node.GroupBy)...)) } - case *ParseNode: - treeNode.Properties = []tree.Property{ - tree.NewProperty("kind", false, node.Kind.String()), - } - if len(node.RequestedKeys) > 0 { - treeNode.Properties = append(treeNode.Properties, tree.NewProperty("requested_keys", true, toAnySlice(node.RequestedKeys)...)) - } case *ColumnCompat: treeNode.Properties = []tree.Property{ tree.NewProperty("src", false, node.Source), diff --git a/pkg/engine/internal/planner/planner_test.go b/pkg/engine/internal/planner/planner_test.go index 54fa08e4ed180..b942dd3fde4de 100644 --- a/pkg/engine/internal/planner/planner_test.go +++ b/pkg/engine/internal/planner/planner_test.go @@ -266,6 +266,35 @@ VectorAggregation operation=sum group_by=(ambiguous.bar) └── @target type=ScanTypeDataObject location=objects/00/0000000000.dataobj streams=5 section_id=0 projections=() `, }, + { + comment: "parse logfmt", + query: `{app="foo"} | logfmt`, + expected: ` +Limit offset=0 limit=1000 +└── TopK sort_by=builtin.timestamp ascending=false nulls_first=false k=1000 + └── Parallelize + └── TopK sort_by=builtin.timestamp ascending=false nulls_first=false k=1000 + └── Projection all=true expand=(PARSE_LOGFMT(builtin.message)) + └── Compat src=metadata dst=metadata collision=label + └── ScanSet num_targets=2 predicate[0]=GTE(builtin.timestamp, 2025-01-01T00:00:00Z) predicate[1]=LT(builtin.timestamp, 2025-01-01T01:00:00Z) + ├── @target type=ScanTypeDataObject location=objects/00/0000000000.dataobj streams=5 section_id=1 projections=() + └── @target type=ScanTypeDataObject location=objects/00/0000000000.dataobj streams=5 section_id=0 projections=() + `, + }, + { + comment: "parse logfmt with grouping", + query: `sum by (bar) (count_over_time({app="foo"} | logfmt[1m]))`, + expected: ` +VectorAggregation +└── RangeAggregation operation=count start=2025-01-01T00:00:00Z end=2025-01-01T01:00:00Z step=0s range=1m0s partition_by=(ambiguous.bar) + └── Parallelize + └── Projection all=true expand=(PARSE_LOGFMT(builtin.message, requestedKeys: [bar])) + └── Compat src=metadata dst=metadata collision=label + └── ScanSet num_targets=2 projections=(builtin.message, builtin.timestamp) predicate[0]=GTE(builtin.timestamp, 2024-12-31T23:59:00Z) predicate[1]=LT(builtin.timestamp, 2025-01-01T01:00:00Z) + ├── @target type=ScanTypeDataObject location=objects/00/0000000000.dataobj streams=5 section_id=1 projections=() + └── @target type=ScanTypeDataObject location=objects/00/0000000000.dataobj streams=5 section_id=0 projections=() + `, + }, } for _, tc := range testCases { diff --git a/pkg/engine/internal/semconv/identifier.go b/pkg/engine/internal/semconv/identifier.go index 357dc7d096fa5..2edbf87b55306 100644 --- a/pkg/engine/internal/semconv/identifier.go +++ b/pkg/engine/internal/semconv/identifier.go @@ -101,6 +101,13 @@ func (i *Identifier) Equal(other *Identifier) bool { i.dataType == other.dataType } +func (i *Identifier) ColumnRef() types.ColumnRef { + return types.ColumnRef{ + Column: i.ShortName(), + Type: i.columnType, + } +} + // FQN returns a fully qualified name for a column by given name, column type, and data type. 
func FQN(name string, ct types.ColumnType, dt types.DataType) string {
 	return NewIdentifier(name, ct, dt).FQN()
diff --git a/pkg/engine/internal/types/column.go b/pkg/engine/internal/types/column.go
index cfd2caf143e5a..f7b60e0ca35ca 100644
--- a/pkg/engine/internal/types/column.go
+++ b/pkg/engine/internal/types/column.go
@@ -54,6 +54,11 @@ const (
 	SampleExtractionErrorType = "SampleExtractionErr"
 )
 
+// Named function arguments
+const (
+	ParseRequestedKeys = "requestedKeys"
+)
+
 var ctNames = [7]string{"invalid", "builtin", "label", "metadata", "parsed", "ambiguous", "generated"}
 
 // String returns a human-readable representation of the column type.
diff --git a/pkg/engine/internal/types/literal.go b/pkg/engine/internal/types/literal.go
index ed6029334f9eb..acca5e0a65de1 100644
--- a/pkg/engine/internal/types/literal.go
+++ b/pkg/engine/internal/types/literal.go
@@ -3,8 +3,10 @@ package types //nolint:revive
 import (
 	"fmt"
 	"strconv"
+	"strings"
 	"time"
 
+	"github.com/apache/arrow-go/v18/arrow"
 	"github.com/dustin/go-humanize"
 
 	"github.com/grafana/loki/v3/pkg/engine/internal/util"
@@ -179,6 +181,30 @@ func (b BytesLiteral) Value() Bytes {
 	return Bytes(b)
 }
 
+type StringListLiteral []string
+
+// String implements Literal.
+func (l StringListLiteral) String() string {
+	return "[" + strings.Join(l, ", ") + "]"
+}
+
+// Type implements Literal.
+func (l StringListLiteral) Type() DataType {
+	return tList{
+		arrowType: arrow.ListOf(arrow.BinaryTypes.String),
+	}
+}
+
+// Any implements Literal.
+func (l StringListLiteral) Any() any {
+	return l.Value()
+}
+
+func (l StringListLiteral) Value() []string {
+	return l
+}
+
 // Literal holds a value of [any] typed as [DataType].
 type Literal interface {
 	fmt.Stringer
@@ -212,6 +238,8 @@ var (
 	_ TypedLiteral[Duration] = (*DurationLiteral)(nil)
 	_ Literal                = (*BytesLiteral)(nil)
 	_ TypedLiteral[Bytes]    = (*BytesLiteral)(nil)
+	_ Literal                = (*StringListLiteral)(nil)
+	_ TypedLiteral[[]string] = (*StringListLiteral)(nil)
 )
 
 func NewLiteral[T LiteralType](value T) Literal {
@@ -230,6 +258,8 @@ func NewLiteral[T LiteralType](value T) Literal {
 		return DurationLiteral(val)
 	case Bytes:
 		return BytesLiteral(val)
+	case []string:
+		return StringListLiteral(val)
 	}
 	panic(fmt.Sprintf("invalid literal value type %T", value))
 }
diff --git a/pkg/engine/internal/types/operators.go b/pkg/engine/internal/types/operators.go
index 29d2eb3f3f841..8eb17aa04a868 100644
--- a/pkg/engine/internal/types/operators.go
+++ b/pkg/engine/internal/types/operators.go
@@ -123,3 +123,27 @@ func (t BinaryOp) String() string {
 		panic(fmt.Sprintf("unknown binary operator %d", t))
 	}
 }
+
+// FunctionOp denotes the kind of function operation to perform.
+type FunctionOp uint32
+
+// Recognized values of [FunctionOp].
+const (
+	// FunctionOpInvalid indicates an invalid function operation.
+	FunctionOpInvalid FunctionOp = iota
+
+	FunctionOpParseLogfmt // Parses a logfmt log line into a set of columns (logfmt).
+	FunctionOpParseJSON   // Parses a JSON log line into a set of columns (json).
+)
+
+// String returns the string representation of the FunctionOp.
+func (t FunctionOp) String() string {
+	switch t {
+	case FunctionOpParseLogfmt:
+		return "PARSE_LOGFMT"
+	case FunctionOpParseJSON:
+		return "PARSE_JSON"
+	default:
+		panic(fmt.Sprintf("unknown function operator %d", t))
+	}
+}
diff --git a/pkg/engine/internal/types/types.go b/pkg/engine/internal/types/types.go
index b682b9255cf35..7ba8bc6fc2f90 100644
--- a/pkg/engine/internal/types/types.go
+++ b/pkg/engine/internal/types/types.go
@@ -20,6 +20,7 @@ const (
 	FLOAT64   = Type(arrow.FLOAT64)
 	TIMESTAMP = Type(arrow.TIMESTAMP)
 	STRUCT    = Type(arrow.STRUCT)
+	LIST      = Type(arrow.LIST)
 )
 
 func (t Type) String() string {
@@ -110,6 +111,14 @@ func NewStructType(arrowType *arrow.StructType) DataType {
 	return tStruct{arrowType: arrowType}
 }
 
+type tList struct {
+	arrowType *arrow.ListType
+}
+
+func (t tList) ID() Type                  { return LIST }
+func (t tList) String() string            { return "list" }
+func (t tList) ArrowType() arrow.DataType { return t.arrowType }
+
 var (
 	names = map[string]DataType{
 		Loki.Null.String(): Loki.Null,
diff --git a/pkg/engine/internal/workflow/workflow_planner_test.go b/pkg/engine/internal/workflow/workflow_planner_test.go
index 73241a453e726..3dc2bf2a2302a 100644
--- a/pkg/engine/internal/workflow/workflow_planner_test.go
+++ b/pkg/engine/internal/workflow/workflow_planner_test.go
@@ -8,6 +8,8 @@ import (
 	"github.com/stretchr/testify/require"
 
 	"github.com/grafana/loki/v3/pkg/engine/internal/planner/physical"
+	"github.com/grafana/loki/v3/pkg/engine/internal/semconv"
+	"github.com/grafana/loki/v3/pkg/engine/internal/types"
 	"github.com/grafana/loki/v3/pkg/engine/internal/util/dag"
 )
 
@@ -127,7 +129,22 @@ RangeAggregation operation=invalid start=0001-01-01T00:00:00Z end=0001-01-01T00:
 		parallelize = physicalGraph.Add(&physical.Parallelize{})
 		filter      = physicalGraph.Add(&physical.Filter{})
-		parse       = physicalGraph.Add(&physical.ParseNode{Kind: physical.ParserLogfmt})
+		project     = physicalGraph.Add(
+			&physical.Projection{
+				Expressions: []physical.Expression{
+					&physical.FunctionExpr{
+						Op: types.FunctionOpParseLogfmt,
+						Expressions: []physical.Expression{
+							&physical.ColumnExpr{
+								Ref: semconv.ColumnIdentMessage.ColumnRef(),
+							},
+						},
+					},
+				},
+				All:    true,
+				Expand: true,
+			},
+		)
 		scanSet = physicalGraph.Add(&physical.ScanSet{
 			Targets: []*physical.ScanTarget{
 				{Type: physical.ScanTypeDataObject, DataObject: &physical.DataObjScan{Location: "a"}},
@@ -140,8 +157,8 @@
 	_ = physicalGraph.AddEdge(dag.Edge[physical.Node]{Parent: vectorAgg, Child: rangeAgg})
 	_ = physicalGraph.AddEdge(dag.Edge[physical.Node]{Parent: rangeAgg, Child: parallelize})
 	_ = physicalGraph.AddEdge(dag.Edge[physical.Node]{Parent: parallelize, Child: filter})
-	_ = physicalGraph.AddEdge(dag.Edge[physical.Node]{Parent: filter, Child: parse})
-	_ = physicalGraph.AddEdge(dag.Edge[physical.Node]{Parent: parse, Child: scanSet})
+	_ = physicalGraph.AddEdge(dag.Edge[physical.Node]{Parent: filter, Child: project})
+	_ = physicalGraph.AddEdge(dag.Edge[physical.Node]{Parent: project, Child: scanSet})
 
 	physicalPlan := physical.FromGraph(physicalGraph)
 
@@ -169,21 +186,21 @@ Task 00000000000000000000000003
 -------------------------------
 Filter
 │   └── @sink stream=00000000000000000000000007
-└── Parse kind=logfmt
+└── Projection all=true expand=(PARSE_LOGFMT(builtin.message))
     └── DataObjScan location=a streams=0 section_id=0 projections=()
 
 Task 00000000000000000000000004
 -------------------------------
 Filter
 │   └── @sink stream=00000000000000000000000008
-└── Parse kind=logfmt
kind=logfmt +└── Projection all=true expand=(PARSE_LOGFMT(builtin.message)) └── DataObjScan location=b streams=0 section_id=0 projections=() Task 00000000000000000000000005 ------------------------------- Filter │ └── @sink stream=00000000000000000000000009 -└── Parse kind=logfmt +└── Projection all=true expand=(PARSE_LOGFMT(builtin.message)) └── DataObjScan location=c streams=0 section_id=0 projections=() `) From 93be247aa7e4a418b98f2afc2e79c7e9149b0a6e Mon Sep 17 00:00:00 2001 From: Trevor Whitney Date: Fri, 24 Oct 2025 14:04:07 -0600 Subject: [PATCH 02/12] chore: merge cleanup --- .../planner/physical/optimizer_test.go | 112 +++++------------- 1 file changed, 30 insertions(+), 82 deletions(-) diff --git a/pkg/engine/internal/planner/physical/optimizer_test.go b/pkg/engine/internal/planner/physical/optimizer_test.go index 8a518be1c8b36..cef0e4de8c734 100644 --- a/pkg/engine/internal/planner/physical/optimizer_test.go +++ b/pkg/engine/internal/planner/physical/optimizer_test.go @@ -627,6 +627,35 @@ func TestProjectionPushdown_PushesRequestedKeysToParseOperations(t *testing.T) { return builder.Value() }, }, + { + name: "parse operation extracts all keys for log queries", + buildLogical: func() logical.Value { + // Create a logical plan that represents: + // {app="test"} | logfmt | level="error" + // This is a log query (no RangeAggregation) so should parse all keys + builder := logical.NewBuilder(&logical.MakeTable{ + Selector: &logical.BinOp{ + Left: logical.NewColumnRef("app", types.ColumnTypeLabel), + Right: logical.NewLiteral("test"), + Op: types.BinaryOpEq, + }, + Shard: logical.NewShard(0, 1), // noShard + }) + + // Don't set RequestedKeys here - optimization should determine them + builder = builder.Parse(logical.ParserLogfmt) + + // Add filter with ambiguous column + filterExpr := &logical.BinOp{ + Left: logical.NewColumnRef("level", types.ColumnTypeAmbiguous), + Right: logical.NewLiteral("error"), + Op: types.BinaryOpEq, + } + builder = builder.Select(filterExpr) + return builder.Value() + }, + expectedParseKeysRequested: nil, // Log queries should parse all keys + }, { name: "skips label and builtin columns, only collects ambiguous", buildLogical: func() logical.Value { @@ -672,35 +701,6 @@ func TestProjectionPushdown_PushesRequestedKeysToParseOperations(t *testing.T) { expectedParseKeysRequested: []string{"level"}, expectedDataObjScanProjections: []string{"app", "level", "message", "timestamp"}, }, - { - name: "parse operation extracts all keys for log queries", - buildLogical: func() logical.Value { - // Create a logical plan that represents: - // {app="test"} | logfmt | level="error" - // This is a log query (no RangeAggregation) so should parse all keys - builder := logical.NewBuilder(&logical.MakeTable{ - Selector: &logical.BinOp{ - Left: logical.NewColumnRef("app", types.ColumnTypeLabel), - Right: logical.NewLiteral("test"), - Op: types.BinaryOpEq, - }, - Shard: logical.NewShard(0, 1), // noShard - }) - - // Don't set RequestedKeys here - optimization should determine them - builder = builder.Parse(logical.ParserLogfmt) - - // Add filter with ambiguous column - filterExpr := &logical.BinOp{ - Left: logical.NewColumnRef("level", types.ColumnTypeAmbiguous), - Right: logical.NewLiteral("error"), - Op: types.BinaryOpEq, - } - builder = builder.Select(filterExpr) - return builder.Value() - }, - expectedParseKeysRequested: nil, // Log queries should parse all keys - }, { name: "RangeAggregation with PartitionBy on ambiguous columns", buildLogical: func() logical.Value { @@ -735,7 
+735,7 @@ func TestProjectionPushdown_PushesRequestedKeysToParseOperations(t *testing.T) { expectedDataObjScanProjections: []string{"duration", "message", "service", "timestamp"}, }, { - name: parse operation collects ambiguous columns from RangeAggregation and Filter", + name: "parse operation collects ambiguous columns from RangeAggregation and Filter", buildLogical: func() logical.Value { // Create a logical plan that represents: // sum by(status,code) (count_over_time({app="test"} | logfmt | duration > 100 [5m])) @@ -814,58 +814,6 @@ func TestProjectionPushdown_PushesRequestedKeysToParseOperations(t *testing.T) { return builder.Value() }, }, - { - name: "ambiguous projections are consumed, they are not pushed down to DataObjScans", - buildLogical: func() logical.Value { - // Create a logical plan that represents: - // sum by(app) (count_over_time({app="test"} | logfmt | level="error" [5m]) by (status, app)) - selectorPredicate := &logical.BinOp{ - Left: logical.NewColumnRef("app", types.ColumnTypeLabel), - Right: logical.NewLiteral("test"), - Op: types.BinaryOpEq, - } - builder := logical.NewBuilder(&logical.MakeTable{ - Selector: selectorPredicate, - Predicates: []logical.Value{selectorPredicate}, - Shard: logical.NewShard(0, 1), // noShard - }) - - // Don't set RequestedKeys here - optimization should determine them - builder = builder.Parse(types.FunctionOpParseLogfmt) - - // Add filter with ambiguous column (different from grouping field) - filterExpr := &logical.BinOp{ - Left: logical.NewColumnRef("level", types.ColumnTypeAmbiguous), - Right: logical.NewLiteral("error"), - Op: types.BinaryOpEq, - } - builder = builder.Select(filterExpr) - - // Range aggregation - builder = builder.RangeAggregation( - []logical.ColumnRef{ - {Ref: types.ColumnRef{Column: "status", Type: types.ColumnTypeAmbiguous}}, - {Ref: types.ColumnRef{Column: "app", Type: types.ColumnTypeLabel}}, - }, // no partition by - types.RangeAggregationTypeCount, - time.Unix(0, 0), - time.Unix(3600, 0), - 5*time.Minute, // step - 5*time.Minute, // range interval - ) - - // Vector aggregation with single groupby on parsed field (different from filter field) - builder = builder.VectorAggregation( - []logical.ColumnRef{ - {Ref: types.ColumnRef{Column: "app", Type: types.ColumnTypeLabel}}, - }, - types.VectorAggregationTypeSum, - ) - return builder.Value() - }, - expectedParseKeysRequested: []string{"level", "status"}, - expectedDataObjScanProjections: []string{"app", "message", "timestamp"}, - }, } for _, tt := range tests { From 59d3fecba0660353ed1b8d81926385e45199c39a Mon Sep 17 00:00:00 2001 From: Trevor Whitney Date: Mon, 27 Oct 2025 09:51:19 -0600 Subject: [PATCH 03/12] chore: PR comments and rebase/merge fixes --- pkg/engine/internal/executor/column.go | 12 ++ pkg/engine/internal/executor/expressions.go | 18 +-- .../internal/executor/expressions_test.go | 65 +++------ pkg/engine/internal/executor/functions.go | 45 +++--- pkg/engine/internal/executor/parse.go | 57 ++++---- .../internal/planner/logical/builder.go | 2 +- .../planner/logical/builder_convert.go | 2 + .../internal/planner/logical/function_op.go | 2 +- .../internal/planner/logical/planner.go | 4 +- .../internal/planner/physical/expressions.go | 66 +++------ .../internal/planner/physical/optimizer.go | 136 ++++++++++-------- .../planner/physical/optimizer_test.go | 23 ++- .../internal/planner/physical/planner.go | 2 +- .../internal/planner/physical/planner_test.go | 15 +- pkg/engine/internal/types/arrow.go | 2 + pkg/engine/internal/types/column.go | 5 - 
pkg/engine/internal/types/operators.go | 20 +-- pkg/engine/internal/types/types.go | 3 + pkg/engine/internal/util/array_list.go | 21 +++ .../workflow/workflow_planner_test.go | 4 +- 20 files changed, 248 insertions(+), 256 deletions(-) create mode 100644 pkg/engine/internal/util/array_list.go diff --git a/pkg/engine/internal/executor/column.go b/pkg/engine/internal/executor/column.go index 001a8314a4aa3..d66aed786b106 100644 --- a/pkg/engine/internal/executor/column.go +++ b/pkg/engine/internal/executor/column.go @@ -51,6 +51,18 @@ func NewScalar(value types.Literal, rows int) arrow.Array { for range rows { builder.Append(arrow.Timestamp(value)) } + case *array.ListBuilder: + //TODO(twhitney): currently only supporting string list, but we can add more types here as we need them + value, ok := value.Any().([]string) + if ok { + valueBuilder := builder.ValueBuilder().(*array.StringBuilder) + for range rows { + builder.Append(true) + for _, val := range value { + valueBuilder.Append(val) + } + } + } } return builder.NewArray() } diff --git a/pkg/engine/internal/executor/expressions.go b/pkg/engine/internal/executor/expressions.go index bd303750ca0be..460c5dbe6352c 100644 --- a/pkg/engine/internal/executor/expressions.go +++ b/pkg/engine/internal/executor/expressions.go @@ -26,18 +26,6 @@ func (e expressionEvaluator) eval(expr physical.Expression, input arrow.Record) case *physical.ColumnExpr: colIdent := semconv.NewIdentifier(expr.Ref.Column, expr.Ref.Type, types.Loki.String) - // For non-ambiguous columns, we can look up the column in the schema by its fully qualified name. - if expr.Ref.Type != types.ColumnTypeAmbiguous { - for idx, field := range input.Schema().Fields() { - ident, err := semconv.ParseFQN(field.Name) - if err != nil { - return nil, fmt.Errorf("failed to parse column %s: %w", field.Name, err) - } - if ident.ShortName() == colIdent.ShortName() && ident.ColumnType() == colIdent.ColumnType() { - return input.Column(idx), nil - case *physical.ColumnExpr: - colIdent := semconv.NewIdentifier(expr.Ref.Column, expr.Ref.Type, types.Loki.String) - // For non-ambiguous columns, we can look up the column in the schema by its fully qualified name. 
if expr.Ref.Type != types.ColumnTypeAmbiguous { for idx, field := range input.Schema().Fields() { @@ -137,8 +125,8 @@ func (e expressionEvaluator) eval(expr physical.Expression, input arrow.Record) _, rhsIsScalar := expr.Right.(*physical.LiteralExpr) return fn.Evaluate(lhs, rhs, lhsIsScalar, rhsIsScalar) - case *physical.FunctionExpr: - args := make(arrow.Array, len(expr.Expressions)) + case *physical.VariadicExpr: + args := make([]arrow.Array, len(expr.Expressions)) for i, arg := range expr.Expressions { p, err := e.eval(arg, input) if err != nil { @@ -147,7 +135,7 @@ func (e expressionEvaluator) eval(expr physical.Expression, input arrow.Record) args[i] = p } - fn, err := functions.GetForSignature(expr.Op) + fn, err := variadicFunctions.GetForSignature(expr.Op) if err != nil { return nil, fmt.Errorf("failed to lookup unary function: %w", err) } diff --git a/pkg/engine/internal/executor/expressions_test.go b/pkg/engine/internal/executor/expressions_test.go index 7758cb4537346..b3b5b35ce057b 100644 --- a/pkg/engine/internal/executor/expressions_test.go +++ b/pkg/engine/internal/executor/expressions_test.go @@ -12,6 +12,7 @@ import ( "github.com/grafana/loki/v3/pkg/engine/internal/planner/physical" "github.com/grafana/loki/v3/pkg/engine/internal/semconv" "github.com/grafana/loki/v3/pkg/engine/internal/types" + "github.com/grafana/loki/v3/pkg/engine/internal/util" "github.com/grafana/loki/v3/pkg/util/arrowtest" ) @@ -85,7 +86,7 @@ func TestEvaluateLiteralExpression(t *testing.T) { arrowType: arrow.INT64, }, { - name: "list", + name: "string list", value: []string{"a", "b", "c"}, arrowType: arrow.LIST, }, @@ -116,6 +117,8 @@ func TestEvaluateLiteralExpression(t *testing.T) { val = arr.Value(i) case *array.Timestamp: val = arr.Value(i) + case *array.List: + val = util.ArrayListValue(arr, i) } if tt.want != nil { require.Equal(t, tt.want, val) @@ -127,30 +130,6 @@ func TestEvaluateLiteralExpression(t *testing.T) { } } -func TestEvaluateNamedListeralListExpression(t *testing.T) { - colMsg := "utf8.builtin.message" - lit := &physical.NamedLiteralExpr{ - Name: "test", - Literal: types.NewLiteral([]string{"a", "b", "c"}), - } - types.NewLiteral([]string{"a", "b", "c"}) - e := newExpressionEvaluator() - schema := arrow.NewSchema([]arrow.Field{ - semconv.FieldFromIdent(semconv.ColumnIdentMessage, false), - }, nil) - input := arrowtest.Rows{ - {colMsg: `some log`}, - {colMsg: `some other log`}, - } - record := input.Record(memory.DefaultAllocator, schema) - result, err := e.eval(lit, record) - require.NoError(t, err) - - require.Equal(t, int64(2), result.Len()) - require.Equal(t, []string{"a", "b", "c"}, result.Value(0)) - require.Equal(t, []string{"a", "b", "c"}, result.Value(1)) -} - func TestEvaluateColumnExpression(t *testing.T) { e := newExpressionEvaluator() @@ -790,14 +769,13 @@ func TestEvaluateParseExpression_Logfmt(t *testing.T) { }, } { t.Run(tt.name, func(t *testing.T) { - expr := &physical.FunctionExpr{ - Op: types.FunctionOpParseLogfmt, + expr := &physical.VariadicExpr{ + Op: types.VariadicOpParseLogfmt, Expressions: []physical.Expression{ &physical.ColumnExpr{ Ref: semconv.ColumnIdentMessage.ColumnRef(), }, - &physical.NamedLiteralExpr{ - Name: types.ParseRequestedKeys, + &physical.LiteralExpr{ Literal: types.NewLiteral(tt.requestedKeys), }, }, @@ -805,12 +783,12 @@ func TestEvaluateParseExpression_Logfmt(t *testing.T) { e := newExpressionEvaluator() record := tt.input.Record(memory.DefaultAllocator, tt.schema) - colVec, err := e.eval(expr, record) + col, err := e.eval(expr, record) 
require.NoError(t, err) - id := colVec.Type().ArrowType().ID() + id := col.DataType().ID() require.Equal(t, arrow.STRUCT, id) - arr, ok := colVec.ToArray().(*array.Struct) + arr, ok := col.(*array.Struct) require.True(t, ok) defer arr.Release() @@ -1187,14 +1165,13 @@ func TestEvaluateParseExpression_JSON(t *testing.T) { t.Run(tt.name, func(t *testing.T) { alloc := memory.DefaultAllocator - expr := &physical.FunctionExpr{ - Op: types.FunctionOpParseJSON, + expr := &physical.VariadicExpr{ + Op: types.VariadicOpParseJSON, Expressions: []physical.Expression{ &physical.ColumnExpr{ Ref: semconv.ColumnIdentMessage.ColumnRef(), }, - &physical.NamedLiteralExpr{ - Name: types.ParseRequestedKeys, + &physical.LiteralExpr{ Literal: types.NewLiteral(tt.requestedKeys), }, }, @@ -1202,12 +1179,12 @@ func TestEvaluateParseExpression_JSON(t *testing.T) { e := newExpressionEvaluator() record := tt.input.Record(alloc, tt.schema) - colVec, err := e.eval(expr, record) + col, err := e.eval(expr, record) require.NoError(t, err) - id := colVec.Type().ArrowType().ID() + id := col.DataType().ID() require.Equal(t, arrow.STRUCT, id) - arr, ok := colVec.ToArray().(*array.Struct) + arr, ok := col.(*array.Struct) require.True(t, ok) require.Equal(t, tt.expectedFields, arr.NumField()) //value, error, errorDetails @@ -1224,8 +1201,8 @@ func TestEvaluateParseExpression_HandleMissingRequestedKeys(t *testing.T) { colMsg := "utf8.builtin.message" alloc := memory.DefaultAllocator - expr := &physical.FunctionExpr{ - Op: types.FunctionOpParseJSON, + expr := &physical.VariadicExpr{ + Op: types.VariadicOpParseJSON, Expressions: []physical.Expression{ &physical.ColumnExpr{ Ref: semconv.ColumnIdentMessage.ColumnRef(), @@ -1245,12 +1222,12 @@ func TestEvaluateParseExpression_HandleMissingRequestedKeys(t *testing.T) { }, nil) record := input.Record(alloc, schema) - colVec, err := e.eval(expr, record) + col, err := e.eval(expr, record) require.NoError(t, err) - id := colVec.Type().ArrowType().ID() + id := col.DataType().ID() require.Equal(t, arrow.STRUCT, id) - arr, ok := colVec.ToArray().(*array.Struct) + arr, ok := col.(*array.Struct) require.True(t, ok) require.Equal(t, 2, arr.NumField()) //level, status diff --git a/pkg/engine/internal/executor/functions.go b/pkg/engine/internal/executor/functions.go index 59c445b442b10..6a224b32707ab 100644 --- a/pkg/engine/internal/executor/functions.go +++ b/pkg/engine/internal/executor/functions.go @@ -15,9 +15,9 @@ import ( ) var ( - unaryFunctions UnaryFunctionRegistry = &unaryFuncReg{} - binaryFunctions BinaryFunctionRegistry = &binaryFuncReg{} - functions FunctionRegistry = &funcReg{} + unaryFunctions UnaryFunctionRegistry = &unaryFuncReg{} + binaryFunctions BinaryFunctionRegistry = &binaryFuncReg{} + variadicFunctions VariadicFunctionRegistry = &variadicFuncReg{} ) func init() { @@ -111,8 +111,8 @@ func init() { unaryFunctions.register(types.UnaryOpCastDuration, arrow.BinaryTypes.String, castFn(types.UnaryOpCastDuration)) // Parse functions - functions.register(types.FunctionOpParseLogfmt, parseFn(types.FunctionOpParseLogfmt)) - functions.register(types.FunctionOpParseJSON, parseFn(types.FunctionOpParseJSON)) + variadicFunctions.register(types.VariadicOpParseLogfmt, parseFn(types.VariadicOpParseLogfmt)) + variadicFunctions.register(types.VariadicOpParseJSON, parseFn(types.VariadicOpParseJSON)) } type UnaryFunctionRegistry interface { @@ -348,36 +348,41 @@ func boolToInt(b bool) int { return i } -type FunctionRegistry interface { - register(types.FunctionOp, Function) - 
GetForSignature(types.FunctionOp) (Function, error) +type VariadicFunctionRegistry interface { + register(types.VariadicOp, VariadicFunction) + GetForSignature(types.VariadicOp) (VariadicFunction, error) } -type Function interface { +type VariadicFunction interface { Evaluate(args ...arrow.Array) (arrow.Array, error) } -type FunctionFunc func(args ...arrow.Array) (arrow.Array, error) +type VariadicFunctionFunc func(args ...arrow.Array) (arrow.Array, error) -func (f FunctionFunc) Evaluate(args ...arrow.Array) (arrow.Array, error) { +func (f VariadicFunctionFunc) Evaluate(args ...arrow.Array) (arrow.Array, error) { return f(args...) } -type funcReg struct { - reg map[types.FunctionOp]Function +type variadicFuncReg struct { + reg map[types.VariadicOp]VariadicFunction } -// register implements UnaryFunctionRegistry. -func (u *funcReg) register(op types.FunctionOp, f Function) { +// register implements VariadicFunctionRegistry. +func (u *variadicFuncReg) register(op types.VariadicOp, f VariadicFunction) { if u.reg == nil { - u.reg = make(map[types.FunctionOp]Function) + u.reg = make(map[types.VariadicOp]VariadicFunction) } - // TODO(twhitney): Should the function panic when duplicate keys are registered? - u.reg[op] = f + + _, exists := u.reg[op] + if exists { + panic(fmt.Sprintf("duplicate variadic function registration for %s", op)) + } + + u.reg[op] = f } -// GetForSignature implements UnaryFunctionRegistry. -func (u *funcReg) GetForSignature(op types.FunctionOp) (Function, error) { +// GetForSignature implements VariadicFunctionRegistry. +func (u *variadicFuncReg) GetForSignature(op types.VariadicOp) (VariadicFunction, error) { // Get registered function for the specific operation fn, ok := u.reg[op] if !ok { diff --git a/pkg/engine/internal/executor/parse.go b/pkg/engine/internal/executor/parse.go index 0b55a417a4bbe..37561ac29209f 100644 --- a/pkg/engine/internal/executor/parse.go +++ b/pkg/engine/internal/executor/parse.go @@ -11,21 +11,22 @@ import ( "github.com/grafana/loki/v3/pkg/engine/internal/semconv" "github.com/grafana/loki/v3/pkg/engine/internal/types" + "github.com/grafana/loki/v3/pkg/engine/internal/util" ) -func parseFn(op types.FunctionOp) Function { - return FunctionFunc(func(args ...ColumnVector) (ColumnVector, error) { +func parseFn(op types.VariadicOp) VariadicFunction { + return VariadicFunctionFunc(func(args ...arrow.Array) (arrow.Array, error) { sourceCol, requestedKeys, err := extractParseFnParameters(args) if err != nil { - return nil, err + panic(err) } var headers []string var parsedColumns []arrow.Array switch op { - case types.FunctionOpParseLogfmt: + case types.VariadicOpParseLogfmt: headers, parsedColumns = buildLogfmtColumns(sourceCol, requestedKeys) - case types.FunctionOpParseJSON: + case types.VariadicOpParseJSON: headers, parsedColumns = buildJSONColumns(sourceCol, requestedKeys) default: return nil, fmt.Errorf("unsupported parser kind: %v", op) @@ -47,15 +48,11 @@ func parseFn(op types.FunctionOp) Function { return nil, err } - return &ArrayStruct{ - array: result, - ct: types.ColumnTypeParsed, - rows: int64(sourceCol.Len()), - }, nil + return result, nil }) } -func extractParseFnParameters(args []ColumnVector) (*array.String, []string, error) { +func extractParseFnParameters(args []arrow.Array) (*array.String, []string, error) { // Valid signatures: //parse(sourceColVec) //parse(sourceColVec, requestedKeys) @@ -63,31 +60,39 @@ func extractParseFnParameters(args []ColumnVector) (*array.String, []string, err return nil, nil, fmt.Errorf("parse function 
expected 1 or 2 arguments, got %d", len(args))
 	}
 
-	var sourceColVec, requestedKeysColVec ColumnVector
-	sourceColVec = args[0]
+	var sourceColArr, requestedKeysArr arrow.Array
+	sourceColArr = args[0]
 	if len(args) == 2 {
-		requestedKeysColVec = args[1]
+		requestedKeysArr = args[1]
 	}
 
-	if sourceColVec == nil {
-		return nil, nil, fmt.Errorf("parse function arguments did no include a source ColumnVector to parse")
+	if sourceColArr == nil {
+		return nil, nil, fmt.Errorf("parse function arguments did not include a source ColumnVector to parse")
 	}
 
-	arr := sourceColVec.ToArray()
-
-	sourceCol, ok := arr.(*array.String)
+	sourceCol, ok := sourceColArr.(*array.String)
 	if !ok {
-		return nil, nil, fmt.Errorf("parse can only operate on string column types, got %T", arr)
+		return nil, nil, fmt.Errorf("parse can only operate on string column types, got %T", sourceColArr)
 	}
 
 	var requestedKeys []string
-	if requestedKeysColVec != nil {
-		reqKeysValue := requestedKeysColVec.Value(0)
-		requestedKeys, ok = reqKeysValue.([]string)
-		if !ok {
-			return nil, nil, fmt.Errorf("expected %s argument to be of type []string, got %T", types.ParseRequestedKeys, reqKeysValue)
-		}
+
+	// Requested keys will be the same for all rows, so we only need the first one
+	reqKeysIdx := 0
+	if requestedKeysArr == nil || requestedKeysArr.IsNull(reqKeysIdx) {
+		return sourceCol, requestedKeys, nil
+	}
+
+	reqKeysList, ok := requestedKeysArr.(*array.List)
+	if !ok {
+		return nil, nil, fmt.Errorf("requested keys must be a list of string arrays, got %T", requestedKeysArr)
+	}
+
+	firstRow, ok := util.ArrayListValue(reqKeysList, reqKeysIdx).([]string)
+	if !ok {
+		return nil, nil, fmt.Errorf("requested keys must be a list of string arrays, got a list of %T", firstRow)
 	}
+	requestedKeys = append(requestedKeys, firstRow...)
 
 	return sourceCol, requestedKeys, nil
 }
diff --git a/pkg/engine/internal/planner/logical/builder.go b/pkg/engine/internal/planner/logical/builder.go
index eba6ad8a47eef..9ed5839ae88d4 100644
--- a/pkg/engine/internal/planner/logical/builder.go
+++ b/pkg/engine/internal/planner/logical/builder.go
@@ -40,7 +40,7 @@ func (b *Builder) Limit(skip uint32, fetch uint32) *Builder {
 }
 
 // Parse applies a [Parse] operation to the Builder.
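 // For example (illustrative), the logical planner turns a `| logfmt` stage
 // into a builder.Parse call carrying the logfmt parse operation.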
-func (b *Builder) Parse(op types.FunctionOp) *Builder {
+func (b *Builder) Parse(op types.VariadicOp) *Builder {
 	val := &FunctionOp{
 		Op: op,
 		Values: []Value{
diff --git a/pkg/engine/internal/planner/logical/builder_convert.go b/pkg/engine/internal/planner/logical/builder_convert.go
index 122274df142d7..f09fdbf128988 100644
--- a/pkg/engine/internal/planner/logical/builder_convert.go
+++ b/pkg/engine/internal/planner/logical/builder_convert.go
@@ -47,6 +47,8 @@ func (b *ssaBuilder) process(value Value) (Value, error) {
 		return b.processRangeAggregate(value)
 	case *VectorAggregation:
 		return b.processVectorAggregation(value)
+	case *UnaryOp:
+		return b.processUnaryOp(value)
 	case *BinOp:
 		return b.processBinOp(value)
 	case *ColumnRef:
diff --git a/pkg/engine/internal/planner/logical/function_op.go b/pkg/engine/internal/planner/logical/function_op.go
index 2c399aaae1c50..3c3a363b6ba39 100644
--- a/pkg/engine/internal/planner/logical/function_op.go
+++ b/pkg/engine/internal/planner/logical/function_op.go
@@ -12,7 +12,7 @@ import (
 type FunctionOp struct {
 	id string
 
-	Op     types.FunctionOp
+	Op     types.VariadicOp
 	Values []Value
 }
diff --git a/pkg/engine/internal/planner/logical/planner.go b/pkg/engine/internal/planner/logical/planner.go
index 11bb583187f1c..eea821e163f5b 100644
--- a/pkg/engine/internal/planner/logical/planner.go
+++ b/pkg/engine/internal/planner/logical/planner.go
@@ -190,10 +190,10 @@ func buildPlanForLogQuery(
 	// for example, the query `{app="foo"} | json | line_format "{{.nested_json}}" | json ` is valid, and will need
 	// multiple parse stages. We will handle this in a future PR.
 	if hasLogfmtParser {
-		builder = builder.Parse(types.FunctionOpParseLogfmt)
+		builder = builder.Parse(types.VariadicOpParseLogfmt)
 	}
 	if hasJSONParser {
-		builder = builder.Parse(types.FunctionOpParseJSON)
+		builder = builder.Parse(types.VariadicOpParseJSON)
 	}
 	for _, value := range postParsePredicates {
 		builder = builder.Select(value)
diff --git a/pkg/engine/internal/planner/physical/expressions.go b/pkg/engine/internal/planner/physical/expressions.go
index dff385f00dc58..c9d8537d3400a 100644
--- a/pkg/engine/internal/planner/physical/expressions.go
+++ b/pkg/engine/internal/planner/physical/expressions.go
@@ -15,7 +15,7 @@ const (
 	ExprTypeUnary
 	ExprTypeBinary
-	ExprTypeFunction
+	ExprTypeVariadic
 	ExprTypeLiteral
 	ExprTypeColumn
 )
@@ -27,8 +27,8 @@ func (t ExpressionType) String() string {
 		return "UnaryExpression"
 	case ExprTypeBinary:
 		return "BinaryExpression"
-	case ExprTypeFunction:
-		return "FunctionExpression"
+	case ExprTypeVariadic:
+		return "VariadicExpression"
 	case ExprTypeLiteral:
 		return "LiteralExpression"
 	case ExprTypeColumn:
@@ -214,32 +214,27 @@ func (e *ColumnExpr) Type() ExpressionType {
 	return ExprTypeColumn
 }
 
-// FunctionExpr is an expression that implements the [FunctionExpression] interface.
-type FunctionExpr struct {
+// VariadicExpr is an expression that implements the [FunctionExpression] interface.
+type VariadicExpr struct {
 	// Op is the function operation to apply to the parameters
-	Op types.FunctionOp
+	Op types.VariadicOp
 	// Expressions are the parameters passed to the function
 	Expressions []Expression
 }
 
-func (*FunctionExpr) isExpr()         {}
-func (*FunctionExpr) isFunctionExpr() {}
+func (*VariadicExpr) isExpr()         {}
+func (*VariadicExpr) isFunctionExpr() {}
 
-// Clone returns a copy of the [FunctionExpr].
-func (e *FunctionExpr) Clone() Expression {
-	params := make([]Expression, len(e.Expressions))
-	for i, param := range e.Expressions {
-		params[i] = param.Clone()
-	}
-
-	return &FunctionExpr{
-		Expressions: params,
+// Clone returns a copy of the [VariadicExpr].
+func (e *VariadicExpr) Clone() Expression {
+	return &VariadicExpr{
+		Expressions: cloneExpressions(e.Expressions),
 		Op:          e.Op,
 	}
 }
 
-func (e *FunctionExpr) String() string {
+func (e *VariadicExpr) String() string {
 	exprs := make([]string, len(e.Expressions))
 	for i, expr := range e.Expressions {
 		exprs[i] = expr.String()
@@ -247,36 +242,7 @@ func (e *FunctionExpr) String() string {
 	return fmt.Sprintf("%s(%s)", e.Op, strings.Join(exprs, ", "))
 }
 
-// Type returns the type of the [FunctionExpr].
-func (*FunctionExpr) Type() ExpressionType {
-	return ExprTypeFunction
-}
-
-type NamedLiteralExpr struct {
-	types.Literal
-	Name string
-}
-
-func (*NamedLiteralExpr) isExpr()        {}
-func (*NamedLiteralExpr) isLiteralExpr() {}
-
-// Clone returns a copy of the [NamedLiteralExpr].
-func (e *NamedLiteralExpr) Clone() Expression {
-	// No need to clone literals.
-	return &NamedLiteralExpr{Literal: e.Literal, Name: e.Name}
-}
-
-// String returns the string representation of the literal value.
-func (e *NamedLiteralExpr) String() string {
-	return fmt.Sprintf("%s: %s", e.Name, e.Literal.String())
-}
-
-// Type returns the type of the [FuntionLiteralParameterExpr].
-func (*NamedLiteralExpr) Type() ExpressionType {
-	return ExprTypeLiteral
-}
-
-// ValueType returns the kind of value represented by the literal.
-func (e *NamedLiteralExpr) ValueType() types.DataType {
-	return e.Literal.Type()
+// Type returns the type of the [VariadicExpr].
+func (*VariadicExpr) Type() ExpressionType {
+	return ExprTypeVariadic
 }
diff --git a/pkg/engine/internal/planner/physical/optimizer.go b/pkg/engine/internal/planner/physical/optimizer.go
index 6bcf33802940d..2d978efc56391 100644
--- a/pkg/engine/internal/planner/physical/optimizer.go
+++ b/pkg/engine/internal/planner/physical/optimizer.go
@@ -1,6 +1,7 @@
 package physical
 
 import (
+	"fmt"
 	"maps"
 	"slices"
 	"sort"
@@ -268,16 +269,6 @@ func (r *projectionPushdown) propagateProjections(node Node, projections []Colum
 		extracted := extractColumnsFromPredicates(node.Predicates)
 		projections = append(projections, extracted...)
 
-	case *Proejction:
-		// Projections are a special case. It is both a target for and a source of projections.
-		// [Target] Operations may take columns as arguments, such as requested keys for parse..
-		// [Source] Operations may contain columns to append, such as builtin message column for parse.
-		var projectionNodeChanged bool
-		projectionNodeChanged, projections = r.handleProjection(node, projections)
-		if projectionNodeChanged {
-			changed = true
-		}
-
 	case *ScanSet:
 		// [Target] ScanSet - projections are applied here.
 		return r.handleScanSet(node, projections)
@@ -287,16 +278,25 @@ func (r *projectionPushdown) propagateProjections(node Node, projections []Colum
 		return r.handleDataobjScan(node, projections)
 
 	case *Projection:
-		if node.Expand {
-			// [Source] column referred by unwrap.
-			for _, e := range node.Expressions {
-				e, isUnary := e.(*UnaryExpr)
-				if isUnary && slices.Contains([]types.UnaryOp{types.UnaryOpCastFloat, types.UnaryOpCastBytes, types.UnaryOpCastDuration}, e.Op) {
+		// Projections are a special case. It is both a target for and a source of projections.
+		// [Target] Operations may take columns as arguments, such as requested keys for parse.
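+		// For example (illustrative), PARSE_LOGFMT(builtin.message, [level])
+		// consumes the requested key "level" pushed down from a filter (target)
+		// and contributes the builtin message column as a projection (source).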
+ // [Source] Operations may contain columns to append, such as builtin message column for parse or source column for unwrap. + for _, e := range node.Expressions { + switch e := e.(type) { + case *UnaryExpr: + if slices.Contains([]types.UnaryOp{types.UnaryOpCastFloat, types.UnaryOpCastBytes, types.UnaryOpCastDuration}, e.Op) { projections = append(projections, e.Left.(ColumnExpression)) } + case *VariadicExpr: + if e.Op == types.VariadicOpParseJSON || e.Op == types.VariadicOpParseLogfmt { + projectionNodeChanged, projsToPropagate := r.handleParse(e, projections) + projections = append(projections, projsToPropagate...) + if projectionNodeChanged { + changed = true + } + } } } - default: // propagate to children } @@ -371,50 +371,19 @@ func (r *projectionPushdown) handleDataobjScan(node *DataObjScan, projections [] return changed } -// handleProjection handles projection pushdown for expressions on Projection nodes -func (r *projectionPushdown) handleProjection(node *Projection, projections []ColumnExpression, applyIfNotEmpty bool) (bool, []ColumnExpression) { - anyChanged := false - for _, e := range node.Expressions { - switch e := e.(type) { - case *FunctionExpr: - if e.Op == types.FunctionOpParseJSON || e.Op == types.FunctionOpParseLogfmt { - return r.handleParse(e, projections, applyIfNotEmpty) - } - } - } - - return anyChanged -} - -func (r *projectionPushdown) handleParse(expr *FunctionExpr, projections []ColumnExpression, applyIfNotEmpty bool) (bool, []ColumnExpression) { +func (r *projectionPushdown) handleParse(expr *VariadicExpr, projections []ColumnExpression) (bool, []ColumnExpression) { _, ambiguousProjections := disambiguateColumns(projections) - var requestedKeysExpr *NamedLiteralExpr - for _, e := range expr.Expressions { - switch e := e.(type) { - case *NamedLiteralExpr: - if e.Name != types.ParseRequestedKeys { - continue - } - - requestedKeysExpr = e - } + var exprs parseExprs + if err := exprs.Unpack(expr.Expressions); err != nil { + panic(err) } - if requestedKeysExpr == nil { - requestedKeysExpr = &NamedLiteralExpr{ - Name: types.ParseRequestedKeys, - Literal: types.NewLiteral([]string{}), - } - expr.Expressions = append(expr.Expressions, requestedKeysExpr) - } - - existingKeys, ok := requestedKeysExpr.Literal.(types.StringListLiteral) + existingKeys, ok := exprs.requestedKeysExpr.Literal.(types.StringListLiteral) if !ok { - return nil, false + panic(fmt.Errorf("expected requested keys to be a list of strings, got %T", exprs.requestedKeysExpr.Literal)) } - // Found a ParseNode - update its keys requestedKeys := make(map[string]bool) for _, k := range existingKeys { requestedKeys[k] = true @@ -437,16 +406,67 @@ func (r *projectionPushdown) handleParse(expr *FunctionExpr, projections []Colum // Convert back to sorted slice newKeys := slices.Collect(maps.Keys(requestedKeys)) sort.Strings(newKeys) - requestedKeysExpr.Literal = types.NewLiteral(newKeys) + exprs.requestedKeysExpr.Literal = types.NewLiteral(newKeys) } - projections = append(projections, &ColumnExpr{ - Ref: types.ColumnRef{Column: types.ColumnNameBuiltinMessage, Type: types.ColumnTypeBuiltin}, - }) - + expr.Expressions = exprs.Pack(expr.Expressions) + projections = append(projections, exprs.sourceColumnExpr) return changed, projections } +// parseExprs is a helper struct for unpacking and packing parse arguments from generic expressions. +type parseExprs struct { + sourceColumnExpr *ColumnExpr + requestedKeysExpr *LiteralExpr +} + +// Unpack unpacks the given expressions into valid expressions for parse. 
+// Valid expressions for parse are ones that will evaluate into valid arguments for a [parseFn].
+// The valid signatures for a [parseFn] are:
+// parseFn(sourceCol [arrow.Array])
+// parseFn(sourceCol [arrow.Array], requestedKeys [arrow.Array]).
+//
+// Therefore the valid expressions are (order matters):
+// [sourceColExpr *ColumnExpr] -> parseFn(sourceColVec arrow.Array)
+// [sourceColExpr *ColumnExpr, requestedKeysExpr *LiteralExpr] -> parseFn(sourceColVec arrow.Array, requestedKeys arrow.Array)
+func (a *parseExprs) Unpack(exprs []Expression) error {
+	if len(exprs) < 1 || len(exprs) > 2 {
+		return fmt.Errorf("expected to unpack 1 or 2 expressions, got %d", len(exprs))
+	}
+
+	var ok bool
+	a.sourceColumnExpr, ok = exprs[0].(*ColumnExpr)
+	if !ok {
+		return fmt.Errorf("expected source column to be a column expression, got %T", exprs[0])
+	}
+
+	if len(exprs) == 2 {
+		a.requestedKeysExpr, ok = exprs[1].(*LiteralExpr)
+		if !ok {
+			return fmt.Errorf("expected requested keys to be a literal expression, got %T", exprs[1])
+		}
+	} else {
+		a.requestedKeysExpr = &LiteralExpr{Literal: types.NewLiteral([]string{})}
+	}
+
+	return nil
+}
+
+// Pack packs parse specific expressions back into generic expressions.
+// It will reuse [dst] if it has enough capacity, otherwise it will allocate a new slice.
+func (a *parseExprs) Pack(dst []Expression) []Expression {
+	if cap(dst) >= 2 {
+		dst = dst[:2]
+	} else {
+		dst = make([]Expression, 2)
+	}
+
+	// order matters
+	dst[0] = a.sourceColumnExpr
+	dst[1] = a.requestedKeysExpr
+	return dst
+}
+
 func sortProjections(a, b ColumnExpression) int {
 	exprA, aOk := a.(*ColumnExpr)
 	exprB, bOk := b.(*ColumnExpr)
diff --git a/pkg/engine/internal/planner/physical/optimizer_test.go b/pkg/engine/internal/planner/physical/optimizer_test.go
index cef0e4de8c734..8f09c3175c3b7 100644
--- a/pkg/engine/internal/planner/physical/optimizer_test.go
+++ b/pkg/engine/internal/planner/physical/optimizer_test.go
@@ -623,7 +623,7 @@ func TestProjectionPushdown_PushesRequestedKeysToParseOperations(t *testing.T) {
 			})
 
 			// Add parse but no filters requiring parsed fields
-			builder = builder.Parse(types.FunctionOpParseLogfmt)
+			builder = builder.Parse(types.VariadicOpParseLogfmt)
 			return builder.Value()
 		},
 	},
@@ -643,7 +643,7 @@ func TestProjectionPushdown_PushesRequestedKeysToParseOperations(t *testing.T) {
 			})
 
 			// Don't set RequestedKeys here - optimization should determine them
-			builder = builder.Parse(logical.ParserLogfmt)
+			builder = builder.Parse(types.VariadicOpParseLogfmt)
 
 			// Add filter with ambiguous
column filterExpr := &logical.BinOp{ @@ -798,7 +798,7 @@ func TestProjectionPushdown_PushesRequestedKeysToParseOperations(t *testing.T) { }) // Add parse without specifying RequestedKeys - builder = builder.Parse(types.FunctionOpParseLogfmt) + builder = builder.Parse(types.VariadicOpParseLogfmt) // Add filter on ambiguous column filterExpr := &logical.BinOp{ @@ -861,17 +861,14 @@ func TestProjectionPushdown_PushesRequestedKeysToParseOperations(t *testing.T) { } require.NotNil(t, projectionNode, "Projection not found in plan") - var requestedKeys *NamedLiteralExpr + var requestedKeys *LiteralExpr for _, expr := range projectionNode.Expressions { switch expr := expr.(type) { - case *FunctionExpr: + case *VariadicExpr: for _, e := range expr.Expressions { switch e := e.(type) { - case *NamedLiteralExpr: - if e.Name == types.ParseRequestedKeys { - requestedKeys = e - break - } + case *LiteralExpr: + requestedKeys = e } } } diff --git a/pkg/engine/internal/planner/physical/planner.go b/pkg/engine/internal/planner/physical/planner.go index 5982d872a8afb..2ef21fb3528c7 100644 --- a/pkg/engine/internal/planner/physical/planner.go +++ b/pkg/engine/internal/planner/physical/planner.go @@ -133,7 +133,7 @@ func (p *Planner) convertPredicate(inst logical.Value) Expression { for i, v := range inst.Values { exprs[i] = p.convertPredicate(v) } - return &FunctionExpr{ + return &VariadicExpr{ Op: inst.Op, Expressions: exprs, } diff --git a/pkg/engine/internal/planner/physical/planner_test.go b/pkg/engine/internal/planner/physical/planner_test.go index 072bbf25129d9..cd0abee45a33c 100644 --- a/pkg/engine/internal/planner/physical/planner_test.go +++ b/pkg/engine/internal/planner/physical/planner_test.go @@ -270,7 +270,7 @@ func TestPlanner_Convert_WithParse(t *testing.T) { Shard: logical.NewShard(0, 1), }, ).Parse( - types.FunctionOpParseLogfmt, + types.VariadicOpParseLogfmt, ).Select( &logical.BinOp{ Left: logical.NewColumnRef("level", types.ColumnTypeAmbiguous), @@ -308,9 +308,9 @@ func TestPlanner_Convert_WithParse(t *testing.T) { require.True(t, ok, "Filter's child should be Projection") require.Len(t, projectionNode.Expressions, 1) - expr, ok := projectionNode.Expressions[0].(*FunctionExpr) + expr, ok := projectionNode.Expressions[0].(*VariadicExpr) require.True(t, ok) - require.Equal(t, types.FunctionOpParseLogfmt, expr.Op) + require.Equal(t, types.VariadicOpParseLogfmt, expr.Op) funcArgs := expr.Expressions require.Len(t, funcArgs, 1) @@ -349,7 +349,7 @@ func TestPlanner_Convert_WithParse(t *testing.T) { Shard: logical.NewShard(0, 1), }, ).Parse( - types.FunctionOpParseLogfmt, + types.VariadicOpParseLogfmt, ).Select( &logical.BinOp{ Left: logical.NewColumnRef("level", types.ColumnTypeAmbiguous), @@ -400,9 +400,9 @@ func TestPlanner_Convert_WithParse(t *testing.T) { require.True(t, ok, "Filter's child should be Projection") require.Len(t, projectionNode.Expressions, 1) - expr, ok := projectionNode.Expressions[0].(*FunctionExpr) + expr, ok := projectionNode.Expressions[0].(*VariadicExpr) require.True(t, ok) - require.Equal(t, types.FunctionOpParseLogfmt, expr.Op) + require.Equal(t, types.VariadicOpParseLogfmt, expr.Op) funcArgs := expr.Expressions require.Len(t, funcArgs, 1) @@ -424,9 +424,8 @@ func TestPlanner_Convert_WithParse(t *testing.T) { require.Equal(t, types.ColumnNameBuiltinMessage, sourcCol.Ref.Column) require.Equal(t, types.ColumnTypeBuiltin, sourcCol.Ref.Type) - reqKeys, ok := funcArgs[1].(*NamedLiteralExpr) + reqKeys, ok := funcArgs[1].(*LiteralExpr) require.True(t, ok) - require.Equal(t, 
types.ParseRequestedKeys, reqKeys.Name)
 
 	keys, ok := reqKeys.Literal.(types.StringListLiteral)
 	require.True(t, ok)
diff --git a/pkg/engine/internal/types/arrow.go b/pkg/engine/internal/types/arrow.go
index 50aff9530a554..920ca9f34714a 100644
--- a/pkg/engine/internal/types/arrow.go
+++ b/pkg/engine/internal/types/arrow.go
@@ -13,6 +13,7 @@ var (
 		Duration DataType
 		Bytes    DataType
 		Struct   DataType
+		List     DataType
 	}{
 		Null: tNull{},
 		Bool: tBool{},
@@ -23,6 +24,7 @@ var (
 		Duration: tDuration{},
 		Bytes:    tBytes{},
 		Struct:   tStruct{},
+		List:     tList{},
 	}
 
 	Arrow = struct {
diff --git a/pkg/engine/internal/types/column.go b/pkg/engine/internal/types/column.go
index f7b60e0ca35ca..cfd2caf143e5a 100644
--- a/pkg/engine/internal/types/column.go
+++ b/pkg/engine/internal/types/column.go
@@ -54,11 +54,6 @@ const (
 	SampleExtractionErrorType = "SampleExtractionErr"
 )
 
-// Named function arguments
-const (
-	ParseRequestedKeys = "requestedKeys"
-)
-
 var ctNames = [7]string{"invalid", "builtin", "label", "metadata", "parsed", "ambiguous", "generated"}
 
 // String returns a human-readable representation of the column type.
diff --git a/pkg/engine/internal/types/operators.go b/pkg/engine/internal/types/operators.go
index 8eb17aa04a868..127509ca06899 100644
--- a/pkg/engine/internal/types/operators.go
+++ b/pkg/engine/internal/types/operators.go
@@ -124,26 +124,26 @@ func (t BinaryOp) String() string {
 	}
 }
 
-// FunctionOp denotes the kind of [FunctionOp] operation to perform.
-type FunctionOp uint32
+// VariadicOp denotes the kind of [VariadicOp] operation to perform.
+type VariadicOp uint32
 
-// Recognized values of [FunctionOp].
+// Recognized values of [VariadicOp].
 const (
-	// VariadicOpKindInvalid indicates an invalid unary operation.
+	// VariadicOpInvalid indicates an invalid variadic operation.
-	FunctionOpInvalid FunctionOp = iota
+	VariadicOpInvalid VariadicOp = iota
 
-	FunctionOpParseLogfmt // Parse logfmt line to set of columns operation (logfmt).
-	FunctionOpParseJSON   // Parse JSON line to set of columns operation (json).
+	VariadicOpParseLogfmt // Parse logfmt line to set of columns operation (logfmt).
+	VariadicOpParseJSON   // Parse JSON line to set of columns operation (json).
 )
 
-// String returns the string representation of the UnaryOp.
+// String returns the string representation of the VariadicOp.
-func (t FunctionOp) String() string { +func (t VariadicOp) String() string { switch t { - case FunctionOpParseLogfmt: + case VariadicOpParseLogfmt: return "PARSE_LOGFMT" - case FunctionOpParseJSON: + case VariadicOpParseJSON: return "PARSE_JSON" default: - panic(fmt.Sprintf("unknown unary operator %d", t)) + panic(fmt.Sprintf("unknown variadic operator %d", t)) } } diff --git a/pkg/engine/internal/types/types.go b/pkg/engine/internal/types/types.go index 7ba8bc6fc2f90..84776b0409c95 100644 --- a/pkg/engine/internal/types/types.go +++ b/pkg/engine/internal/types/types.go @@ -39,6 +39,8 @@ func (t Type) String() string { return "TIMESTAMP" case STRUCT: return "STRUCT" + case LIST: + return "LIST" default: return "INVALID" } @@ -130,6 +132,7 @@ var ( Loki.Duration.String(): Loki.Duration, Loki.Bytes.String(): Loki.Bytes, Loki.Struct.String(): Loki.Struct, + Loki.List.String(): Loki.List, } ) diff --git a/pkg/engine/internal/util/array_list.go b/pkg/engine/internal/util/array_list.go new file mode 100644 index 0000000000000..fdba7b5b0b29e --- /dev/null +++ b/pkg/engine/internal/util/array_list.go @@ -0,0 +1,21 @@ +package util + +import ( + "github.com/apache/arrow-go/v18/arrow/array" +) + +func ArrayListValue(arr *array.List, i int) any { + start, end := arr.ValueOffsets(i) + listValues := arr.ListValues() + switch listValues := listValues.(type) { + case *array.String: + result := make([]string, end-start) + for i := start; i < end; i++ { + result[i-start] = listValues.Value(int(i)) + } + return result + } + + return nil + +} diff --git a/pkg/engine/internal/workflow/workflow_planner_test.go b/pkg/engine/internal/workflow/workflow_planner_test.go index 3dc2bf2a2302a..e02b5a254a42c 100644 --- a/pkg/engine/internal/workflow/workflow_planner_test.go +++ b/pkg/engine/internal/workflow/workflow_planner_test.go @@ -132,8 +132,8 @@ RangeAggregation operation=invalid start=0001-01-01T00:00:00Z end=0001-01-01T00: project = physicalGraph.Add( &physical.Projection{ Expressions: []physical.Expression{ - &physical.FunctionExpr{ - Op: types.FunctionOpParseLogfmt, + &physical.VariadicExpr{ + Op: types.VariadicOpParseLogfmt, Expressions: []physical.Expression{ &physical.ColumnExpr{ Ref: semconv.ColumnIdentMessage.ColumnRef(), From 06af3bd83d12d6d184793f10798b5e28515e528c Mon Sep 17 00:00:00 2001 From: Trevor Whitney Date: Mon, 27 Oct 2025 10:10:08 -0600 Subject: [PATCH 04/12] test: fix planner_test.go --- pkg/engine/internal/planner/planner_test.go | 53 ++++++++++----------- 1 file changed, 24 insertions(+), 29 deletions(-) diff --git a/pkg/engine/internal/planner/planner_test.go b/pkg/engine/internal/planner/planner_test.go index b942dd3fde4de..e8c3ab4b475fa 100644 --- a/pkg/engine/internal/planner/planner_test.go +++ b/pkg/engine/internal/planner/planner_test.go @@ -186,12 +186,11 @@ Limit offset=0 limit=1000 └── Parallelize └── TopK sort_by=builtin.timestamp ascending=false nulls_first=false k=1000 └── Filter predicate[0]=EQ(ambiguous.level, "error") - └── Compat src=parsed dst=parsed collision=label - └── Parse kind=logfmt - └── Compat src=metadata dst=metadata collision=label - └── ScanSet num_targets=2 predicate[0]=GTE(builtin.timestamp, 2025-01-01T00:00:00Z) predicate[1]=LT(builtin.timestamp, 2025-01-01T01:00:00Z) predicate[2]=MATCH_STR(builtin.message, "bar") - ├── @target type=ScanTypeDataObject location=objects/00/0000000000.dataobj streams=5 section_id=1 projections=() - └── @target type=ScanTypeDataObject location=objects/00/0000000000.dataobj streams=5 section_id=0 projections=() + └── 
Projection all=true expand=(PARSE_LOGFMT(builtin.message)) + └── Compat src=metadata dst=metadata collision=label + └── ScanSet num_targets=2 predicate[0]=GTE(builtin.timestamp, 2025-01-01T00:00:00Z) predicate[1]=LT(builtin.timestamp, 2025-01-01T01:00:00Z) predicate[2]=MATCH_STR(builtin.message, "bar") + ├── @target type=ScanTypeDataObject location=objects/00/0000000000.dataobj streams=5 section_id=1 projections=() + └── @target type=ScanTypeDataObject location=objects/00/0000000000.dataobj streams=5 section_id=0 projections=() `, }, { @@ -203,12 +202,11 @@ Limit offset=0 limit=1000 └── Parallelize └── TopK sort_by=builtin.timestamp ascending=false nulls_first=false k=1000 └── Projection all=true drop=(ambiguous.service_name, ambiguous.__error__) - └── Compat src=parsed dst=parsed collision=label - └── Parse kind=logfmt - └── Compat src=metadata dst=metadata collision=label - └── ScanSet num_targets=2 predicate[0]=GTE(builtin.timestamp, 2025-01-01T00:00:00Z) predicate[1]=LT(builtin.timestamp, 2025-01-01T01:00:00Z) - ├── @target type=ScanTypeDataObject location=objects/00/0000000000.dataobj streams=5 section_id=1 projections=() - └── @target type=ScanTypeDataObject location=objects/00/0000000000.dataobj streams=5 section_id=0 projections=() + └── Projection all=true expand=(PARSE_LOGFMT(builtin.message)) + └── Compat src=metadata dst=metadata collision=label + └── ScanSet num_targets=2 predicate[0]=GTE(builtin.timestamp, 2025-01-01T00:00:00Z) predicate[1]=LT(builtin.timestamp, 2025-01-01T01:00:00Z) + ├── @target type=ScanTypeDataObject location=objects/00/0000000000.dataobj streams=5 section_id=1 projections=() + └── @target type=ScanTypeDataObject location=objects/00/0000000000.dataobj streams=5 section_id=0 projections=() `, }, { @@ -224,12 +222,11 @@ VectorAggregation operation=sum group_by=(ambiguous.bar) └── Parallelize └── Projection all=true expand=(CAST_DURATION(ambiguous.request_duration)) └── Filter predicate[0]=NEQ(ambiguous.request_duration, "") - └── Compat src=parsed dst=parsed collision=label - └── Parse kind=logfmt requested_keys=(bar, request_duration) - └── Compat src=metadata dst=metadata collision=label - └── ScanSet num_targets=2 projections=(ambiguous.bar, builtin.message, ambiguous.request_duration, builtin.timestamp) predicate[0]=GTE(builtin.timestamp, 2024-12-31T23:59:00Z) predicate[1]=LT(builtin.timestamp, 2025-01-01T01:00:00Z) - ├── @target type=ScanTypeDataObject location=objects/00/0000000000.dataobj streams=5 section_id=1 projections=() - └── @target type=ScanTypeDataObject location=objects/00/0000000000.dataobj streams=5 section_id=0 projections=() + └── Projection all=true expand=(PARSE_LOGFMT(builtin.message, [bar, request_duration])) + └── Compat src=metadata dst=metadata collision=label + └── ScanSet num_targets=2 projections=(ambiguous.bar, builtin.message, ambiguous.request_duration, builtin.timestamp) predicate[0]=GTE(builtin.timestamp, 2024-12-31T23:59:00Z) predicate[1]=LT(builtin.timestamp, 2025-01-01T01:00:00Z) + ├── @target type=ScanTypeDataObject location=objects/00/0000000000.dataobj streams=5 section_id=1 projections=() + └── @target type=ScanTypeDataObject location=objects/00/0000000000.dataobj streams=5 section_id=0 projections=() `, }, { @@ -240,15 +237,13 @@ VectorAggregation operation=sum └── RangeAggregation operation=count start=2025-01-01T00:00:00Z end=2025-01-01T01:00:00Z step=0s range=1m0s └── Parallelize └── Projection all=true drop=(ambiguous.__error__, ambiguous.__error_details__) - └── Compat src=parsed dst=parsed collision=label - 
└── Parse kind=json - └── Compat src=parsed dst=parsed collision=label - └── Parse kind=logfmt - └── Filter predicate[0]=EQ(ambiguous.detected_level, "error") - └── Compat src=metadata dst=metadata collision=label - └── ScanSet num_targets=2 projections=(ambiguous.detected_level, builtin.message, builtin.timestamp) predicate[0]=GTE(builtin.timestamp, 2024-12-31T23:59:00Z) predicate[1]=LT(builtin.timestamp, 2025-01-01T01:00:00Z) - ├── @target type=ScanTypeDataObject location=objects/00/0000000000.dataobj streams=5 section_id=1 projections=() - └── @target type=ScanTypeDataObject location=objects/00/0000000000.dataobj streams=5 section_id=0 projections=() + └── Projection all=true expand=(PARSE_JSON(builtin.message, [])) + └── Projection all=true expand=(PARSE_LOGFMT(builtin.message, [])) + └── Filter predicate[0]=EQ(ambiguous.detected_level, "error") + └── Compat src=metadata dst=metadata collision=label + └── ScanSet num_targets=2 projections=(ambiguous.detected_level, builtin.message, builtin.timestamp) predicate[0]=GTE(builtin.timestamp, 2024-12-31T23:59:00Z) predicate[1]=LT(builtin.timestamp, 2025-01-01T01:00:00Z) + ├── @target type=ScanTypeDataObject location=objects/00/0000000000.dataobj streams=5 section_id=1 projections=() + └── @target type=ScanTypeDataObject location=objects/00/0000000000.dataobj streams=5 section_id=0 projections=() `, }, @@ -285,10 +280,10 @@ Limit offset=0 limit=1000 comment: "parse logfmt with grouping", query: `sum by (bar) (count_over_time({app="foo"} | logfmt[1m]))`, expected: ` -VectorAggregation +VectorAggregation operation=sum group_by=(ambiguous.bar) └── RangeAggregation operation=count start=2025-01-01T00:00:00Z end=2025-01-01T01:00:00Z step=0s range=1m0s partition_by=(ambiguous.bar) └── Parallelize - └── Projection all=true expand=(PARSE_LOGFMT(builtin.message, requestedKeys: [bar])) + └── Projection all=true expand=(PARSE_LOGFMT(builtin.message, [bar])) └── Compat src=metadata dst=metadata collision=label └── ScanSet num_targets=2 projections=(builtin.message, builtin.timestamp) predicate[0]=GTE(builtin.timestamp, 2024-12-31T23:59:00Z) predicate[1]=LT(builtin.timestamp, 2025-01-01T01:00:00Z) ├── @target type=ScanTypeDataObject location=objects/00/0000000000.dataobj streams=5 section_id=1 projections=() From 2bf61865f3c31215002698cd816d3c12f87b5079 Mon Sep 17 00:00:00 2001 From: Trevor Whitney Date: Mon, 27 Oct 2025 11:35:32 -0600 Subject: [PATCH 05/12] chore: format --- pkg/engine/internal/executor/functions.go | 10 +++++----- pkg/engine/internal/types/types.go | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pkg/engine/internal/executor/functions.go b/pkg/engine/internal/executor/functions.go index 6a224b32707ab..166b257d6316f 100644 --- a/pkg/engine/internal/executor/functions.go +++ b/pkg/engine/internal/executor/functions.go @@ -373,12 +373,12 @@ func (u *variadicFuncReg) register(op types.VariadicOp, f VariadicFunction) { u.reg = make(map[types.VariadicOp]VariadicFunction) } - _, exists := u.reg[op] - if exists { - panic(fmt.Sprintf("duplicate variadic function registration for %s", op)) - } + _, exists := u.reg[op] + if exists { + panic(fmt.Sprintf("duplicate variadic function registration for %s", op)) + } - u.reg[op] = f + u.reg[op] = f } // GetForSignature implements VariadicFunctionRegistry. 
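For readers following the registry refactor above, the register-once/look-up-by-op pattern is easy to see in isolation. Below is a minimal, standalone sketch of the same pattern; all names here (Op, Fn, registry) are illustrative stand-ins, not Loki's actual types:

package main

import "fmt"

// Op stands in for types.VariadicOp: a small enum that keys the registry.
type Op uint32

const (
	OpInvalid Op = iota
	OpParseLogfmt
)

// Fn stands in for VariadicFunction: any number of inputs, one output.
type Fn func(args ...string) (string, error)

type registry struct{ reg map[Op]Fn }

func (r *registry) register(op Op, f Fn) {
	if r.reg == nil {
		r.reg = make(map[Op]Fn)
	}
	if _, exists := r.reg[op]; exists {
		// Mirrors the duplicate-registration panic added earlier in this series.
		panic(fmt.Sprintf("duplicate registration for op %d", op))
	}
	r.reg[op] = f
}

func (r *registry) getForSignature(op Op) (Fn, error) {
	fn, ok := r.reg[op]
	if !ok {
		return nil, fmt.Errorf("no function registered for op %d", op)
	}
	return fn, nil
}

func main() {
	r := &registry{}
	r.register(OpParseLogfmt, func(args ...string) (string, error) {
		return fmt.Sprintf("parsed %d arg(s)", len(args)), nil
	})

	fn, err := r.getForSignature(OpParseLogfmt)
	if err != nil {
		panic(err)
	}
	out, _ := fn("level=error", "status=500")
	fmt.Println(out) // parsed 2 arg(s)
}

Panicking on a duplicate registration turns a silent overwrite at init time into a loud failure, which is the design choice this series settles on.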
diff --git a/pkg/engine/internal/types/types.go b/pkg/engine/internal/types/types.go index 84776b0409c95..37eff8b89135c 100644 --- a/pkg/engine/internal/types/types.go +++ b/pkg/engine/internal/types/types.go @@ -40,7 +40,7 @@ func (t Type) String() string { case STRUCT: return "STRUCT" case LIST: - return "LIST" + return "LIST" default: return "INVALID" } @@ -132,7 +132,7 @@ var ( Loki.Duration.String(): Loki.Duration, Loki.Bytes.String(): Loki.Bytes, Loki.Struct.String(): Loki.Struct, - Loki.List.String(): Loki.List, + Loki.List.String(): Loki.List, } ) From 52d6d5ead228b07c784047d48e254e47a262a082 Mon Sep 17 00:00:00 2001 From: Trevor Whitney Date: Mon, 27 Oct 2025 15:03:46 -0600 Subject: [PATCH 06/12] chore: get off my lawn revive --- pkg/engine/internal/util/array_list.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/engine/internal/util/array_list.go b/pkg/engine/internal/util/array_list.go index fdba7b5b0b29e..6eea899d3ecaa 100644 --- a/pkg/engine/internal/util/array_list.go +++ b/pkg/engine/internal/util/array_list.go @@ -1,4 +1,4 @@ -package util +package util //nolint:revive import ( "github.com/apache/arrow-go/v18/arrow/array" From 9901beb6e6c565e3b512b337c8fcb02597b9156a Mon Sep 17 00:00:00 2001 From: Trevor Whitney Date: Thu, 30 Oct 2025 17:23:49 -0600 Subject: [PATCH 07/12] Update pkg/engine/internal/planner/physical/planner.go Co-authored-by: Christian Haudum Signed-off-by: Trevor Whitney --- pkg/engine/internal/planner/physical/planner.go | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/pkg/engine/internal/planner/physical/planner.go b/pkg/engine/internal/planner/physical/planner.go index 2ef21fb3528c7..01b37b60cfb7f 100644 --- a/pkg/engine/internal/planner/physical/planner.go +++ b/pkg/engine/internal/planner/physical/planner.go @@ -133,10 +133,22 @@ func (p *Planner) convertPredicate(inst logical.Value) Expression { for i, v := range inst.Values { exprs[i] = p.convertPredicate(v) } - return &VariadicExpr{ + node := &VariadicExpr{ Op: inst.Op, Expressions: exprs, } + if p.context.v1Compatible { + compat := &ColumnCompat{ + Source: types.ColumnTypeParsed, + Destination: types.ColumnTypeParsed, + Collision: types.ColumnTypeLabel, + } + node, err = p.wrapNodeWith(node, compat) + if err != nil { + return nil, err + } + } + return node, nil default: panic(fmt.Sprintf("invalid value for predicate: %T", inst)) } From 6bb7553fe1d9d4b6c9372f315c3a2937e5fce6b9 Mon Sep 17 00:00:00 2001 From: Trevor Whitney Date: Thu, 30 Oct 2025 17:24:02 -0600 Subject: [PATCH 08/12] Update pkg/engine/internal/executor/parse.go Co-authored-by: Christian Haudum Signed-off-by: Trevor Whitney --- pkg/engine/internal/executor/parse.go | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pkg/engine/internal/executor/parse.go b/pkg/engine/internal/executor/parse.go index 37561ac29209f..22a19093063fb 100644 --- a/pkg/engine/internal/executor/parse.go +++ b/pkg/engine/internal/executor/parse.go @@ -43,10 +43,7 @@ func parseFn(op types.VariadicOp) VariadicFunction { newFields = append(newFields, semconv.FieldFromIdent(ident, true)) } - result, err := array.NewStructArrayWithFields(parsedColumns, newFields) - if err != nil { - return nil, err - } + return array.NewStructArrayWithFields(parsedColumns, newFields) return result, nil }) From 97fff6ff2113ecf4c1f9015b684937071ad87210 Mon Sep 17 00:00:00 2001 From: Trevor Whitney Date: Thu, 30 Oct 2025 20:43:33 -0600 Subject: [PATCH 09/12] chore: clean up suggestions and merge conflicts --- 
pkg/engine/internal/executor/parse.go | 4 +- .../internal/planner/logical/function_op.go | 4 +- .../internal/planner/physical/optimizer.go | 1 + .../internal/planner/physical/planner.go | 35 +++++---- pkg/engine/internal/planner/planner_test.go | 71 ++++++++++--------- pkg/engine/internal/util/dag/dag.go | 4 +- 6 files changed, 67 insertions(+), 52 deletions(-) diff --git a/pkg/engine/internal/executor/parse.go b/pkg/engine/internal/executor/parse.go index 22a19093063fb..e804b9bede1b8 100644 --- a/pkg/engine/internal/executor/parse.go +++ b/pkg/engine/internal/executor/parse.go @@ -43,9 +43,7 @@ func parseFn(op types.VariadicOp) VariadicFunction { newFields = append(newFields, semconv.FieldFromIdent(ident, true)) } - return array.NewStructArrayWithFields(parsedColumns, newFields) - - return result, nil + return array.NewStructArrayWithFields(parsedColumns, newFields) }) } diff --git a/pkg/engine/internal/planner/logical/function_op.go b/pkg/engine/internal/planner/logical/function_op.go index 3c3a363b6ba39..ffbbb4d63e2b7 100644 --- a/pkg/engine/internal/planner/logical/function_op.go +++ b/pkg/engine/internal/planner/logical/function_op.go @@ -17,8 +17,8 @@ type FunctionOp struct { } var ( - _ Value = (*UnaryOp)(nil) - _ Instruction = (*UnaryOp)(nil) + _ Value = (*FunctionOp)(nil) + _ Instruction = (*FunctionOp)(nil) ) // Name returns an identifier for the UnaryOp operation. diff --git a/pkg/engine/internal/planner/physical/optimizer.go b/pkg/engine/internal/planner/physical/optimizer.go index 2d978efc56391..a738be40f7715 100644 --- a/pkg/engine/internal/planner/physical/optimizer.go +++ b/pkg/engine/internal/planner/physical/optimizer.go @@ -297,6 +297,7 @@ func (r *projectionPushdown) propagateProjections(node Node, projections []Colum } } } + default: // propagate to children } diff --git a/pkg/engine/internal/planner/physical/planner.go b/pkg/engine/internal/planner/physical/planner.go index 01b37b60cfb7f..f77efcaddeed6 100644 --- a/pkg/engine/internal/planner/physical/planner.go +++ b/pkg/engine/internal/planner/physical/planner.go @@ -137,18 +137,7 @@ func (p *Planner) convertPredicate(inst logical.Value) Expression { Op: inst.Op, Expressions: exprs, } - if p.context.v1Compatible { - compat := &ColumnCompat{ - Source: types.ColumnTypeParsed, - Destination: types.ColumnTypeParsed, - Collision: types.ColumnTypeLabel, - } - node, err = p.wrapNodeWith(node, compat) - if err != nil { - return nil, err - } - } - return node, nil + return node default: panic(fmt.Sprintf("invalid value for predicate: %T", inst)) } @@ -301,11 +290,18 @@ func (p *Planner) processSort(lp *logical.Sort, ctx *Context) (Node, error) { // Converts a [logical.Projection] into a physical [Projection] node. 
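 // For example (illustrative), `{app="foo"} | logfmt` produces a Projection
 // with a PARSE_LOGFMT(builtin.message) expression; in v1-compatible mode the
 // planner wraps it in a ColumnCompat (src=parsed dst=parsed collision=label).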
func (p *Planner) processProjection(lp *logical.Projection, ctx *Context) (Node, error) { expressions := make([]Expression, len(lp.Expressions)) + needsCompat := false for i := range lp.Expressions { expressions[i] = p.convertPredicate(lp.Expressions[i]) + if funcExpr, ok := lp.Expressions[i].(*logical.FunctionOp); ok { + if funcExpr.Op == types.VariadicOpParseJSON || funcExpr.Op == types.VariadicOpParseLogfmt { + needsCompat = true + } + } } - node := &Projection{ + var node Node + node = &Projection{ Expressions: expressions, All: lp.All, Expand: lp.Expand, @@ -313,6 +309,19 @@ func (p *Planner) processProjection(lp *logical.Projection, ctx *Context) (Node, } p.plan.graph.Add(node) + if needsCompat && p.context.v1Compatible { + compat := &ColumnCompat{ + Source: types.ColumnTypeParsed, + Destination: types.ColumnTypeParsed, + Collision: types.ColumnTypeLabel, + } + var err error + node, err = p.wrapNodeWith(node, compat) + if err != nil { + return nil, err + } + } + child, err := p.process(lp.Relation, ctx) if err != nil { return nil, err diff --git a/pkg/engine/internal/planner/planner_test.go b/pkg/engine/internal/planner/planner_test.go index e8c3ab4b475fa..cf81c8491e456 100644 --- a/pkg/engine/internal/planner/planner_test.go +++ b/pkg/engine/internal/planner/planner_test.go @@ -186,11 +186,12 @@ Limit offset=0 limit=1000 └── Parallelize └── TopK sort_by=builtin.timestamp ascending=false nulls_first=false k=1000 └── Filter predicate[0]=EQ(ambiguous.level, "error") - └── Projection all=true expand=(PARSE_LOGFMT(builtin.message)) - └── Compat src=metadata dst=metadata collision=label - └── ScanSet num_targets=2 predicate[0]=GTE(builtin.timestamp, 2025-01-01T00:00:00Z) predicate[1]=LT(builtin.timestamp, 2025-01-01T01:00:00Z) predicate[2]=MATCH_STR(builtin.message, "bar") - ├── @target type=ScanTypeDataObject location=objects/00/0000000000.dataobj streams=5 section_id=1 projections=() - └── @target type=ScanTypeDataObject location=objects/00/0000000000.dataobj streams=5 section_id=0 projections=() + └── Compat src=parsed dst=parsed collision=label + └── Projection all=true expand=(PARSE_LOGFMT(builtin.message)) + └── Compat src=metadata dst=metadata collision=label + └── ScanSet num_targets=2 predicate[0]=GTE(builtin.timestamp, 2025-01-01T00:00:00Z) predicate[1]=LT(builtin.timestamp, 2025-01-01T01:00:00Z) predicate[2]=MATCH_STR(builtin.message, "bar") + ├── @target type=ScanTypeDataObject location=objects/00/0000000000.dataobj streams=5 section_id=1 projections=() + └── @target type=ScanTypeDataObject location=objects/00/0000000000.dataobj streams=5 section_id=0 projections=() `, }, { @@ -202,11 +203,12 @@ Limit offset=0 limit=1000 └── Parallelize └── TopK sort_by=builtin.timestamp ascending=false nulls_first=false k=1000 └── Projection all=true drop=(ambiguous.service_name, ambiguous.__error__) - └── Projection all=true expand=(PARSE_LOGFMT(builtin.message)) - └── Compat src=metadata dst=metadata collision=label - └── ScanSet num_targets=2 predicate[0]=GTE(builtin.timestamp, 2025-01-01T00:00:00Z) predicate[1]=LT(builtin.timestamp, 2025-01-01T01:00:00Z) - ├── @target type=ScanTypeDataObject location=objects/00/0000000000.dataobj streams=5 section_id=1 projections=() - └── @target type=ScanTypeDataObject location=objects/00/0000000000.dataobj streams=5 section_id=0 projections=() + └── Compat src=parsed dst=parsed collision=label + └── Projection all=true expand=(PARSE_LOGFMT(builtin.message)) + └── Compat src=metadata dst=metadata collision=label + └── ScanSet num_targets=2 
predicate[0]=GTE(builtin.timestamp, 2025-01-01T00:00:00Z) predicate[1]=LT(builtin.timestamp, 2025-01-01T01:00:00Z) + ├── @target type=ScanTypeDataObject location=objects/00/0000000000.dataobj streams=5 section_id=1 projections=() + └── @target type=ScanTypeDataObject location=objects/00/0000000000.dataobj streams=5 section_id=0 projections=() `, }, { @@ -222,11 +224,12 @@ VectorAggregation operation=sum group_by=(ambiguous.bar) └── Parallelize └── Projection all=true expand=(CAST_DURATION(ambiguous.request_duration)) └── Filter predicate[0]=NEQ(ambiguous.request_duration, "") - └── Projection all=true expand=(PARSE_LOGFMT(builtin.message, [bar, request_duration])) - └── Compat src=metadata dst=metadata collision=label - └── ScanSet num_targets=2 projections=(ambiguous.bar, builtin.message, ambiguous.request_duration, builtin.timestamp) predicate[0]=GTE(builtin.timestamp, 2024-12-31T23:59:00Z) predicate[1]=LT(builtin.timestamp, 2025-01-01T01:00:00Z) - ├── @target type=ScanTypeDataObject location=objects/00/0000000000.dataobj streams=5 section_id=1 projections=() - └── @target type=ScanTypeDataObject location=objects/00/0000000000.dataobj streams=5 section_id=0 projections=() + └── Compat src=parsed dst=parsed collision=label + └── Projection all=true expand=(PARSE_LOGFMT(builtin.message, [bar, request_duration])) + └── Compat src=metadata dst=metadata collision=label + └── ScanSet num_targets=2 projections=(ambiguous.bar, builtin.message, ambiguous.request_duration, builtin.timestamp) predicate[0]=GTE(builtin.timestamp, 2024-12-31T23:59:00Z) predicate[1]=LT(builtin.timestamp, 2025-01-01T01:00:00Z) + ├── @target type=ScanTypeDataObject location=objects/00/0000000000.dataobj streams=5 section_id=1 projections=() + └── @target type=ScanTypeDataObject location=objects/00/0000000000.dataobj streams=5 section_id=0 projections=() `, }, { @@ -237,13 +240,15 @@ VectorAggregation operation=sum └── RangeAggregation operation=count start=2025-01-01T00:00:00Z end=2025-01-01T01:00:00Z step=0s range=1m0s └── Parallelize └── Projection all=true drop=(ambiguous.__error__, ambiguous.__error_details__) - └── Projection all=true expand=(PARSE_JSON(builtin.message, [])) - └── Projection all=true expand=(PARSE_LOGFMT(builtin.message, [])) - └── Filter predicate[0]=EQ(ambiguous.detected_level, "error") - └── Compat src=metadata dst=metadata collision=label - └── ScanSet num_targets=2 projections=(ambiguous.detected_level, builtin.message, builtin.timestamp) predicate[0]=GTE(builtin.timestamp, 2024-12-31T23:59:00Z) predicate[1]=LT(builtin.timestamp, 2025-01-01T01:00:00Z) - ├── @target type=ScanTypeDataObject location=objects/00/0000000000.dataobj streams=5 section_id=1 projections=() - └── @target type=ScanTypeDataObject location=objects/00/0000000000.dataobj streams=5 section_id=0 projections=() + └── Compat src=parsed dst=parsed collision=label + └── Projection all=true expand=(PARSE_JSON(builtin.message, [])) + └── Compat src=parsed dst=parsed collision=label + └── Projection all=true expand=(PARSE_LOGFMT(builtin.message, [])) + └── Filter predicate[0]=EQ(ambiguous.detected_level, "error") + └── Compat src=metadata dst=metadata collision=label + └── ScanSet num_targets=2 projections=(ambiguous.detected_level, builtin.message, builtin.timestamp) predicate[0]=GTE(builtin.timestamp, 2024-12-31T23:59:00Z) predicate[1]=LT(builtin.timestamp, 2025-01-01T01:00:00Z) + ├── @target type=ScanTypeDataObject location=objects/00/0000000000.dataobj streams=5 section_id=1 projections=() + └── @target type=ScanTypeDataObject 
location=objects/00/0000000000.dataobj streams=5 section_id=0 projections=() `, }, @@ -269,11 +274,12 @@ Limit offset=0 limit=1000 └── TopK sort_by=builtin.timestamp ascending=false nulls_first=false k=1000 └── Parallelize └── TopK sort_by=builtin.timestamp ascending=false nulls_first=false k=1000 - └── Projection all=true expand=(PARSE_LOGFMT(builtin.message)) - └── Compat src=metadata dst=metadata collision=label - └── ScanSet num_targets=2 predicate[0]=GTE(builtin.timestamp, 2025-01-01T00:00:00Z) predicate[1]=LT(builtin.timestamp, 2025-01-01T01:00:00Z) - ├── @target type=ScanTypeDataObject location=objects/00/0000000000.dataobj streams=5 section_id=1 projections=() - └── @target type=ScanTypeDataObject location=objects/00/0000000000.dataobj streams=5 section_id=0 projections=() + └── Compat src=parsed dst=parsed collision=label + └── Projection all=true expand=(PARSE_LOGFMT(builtin.message)) + └── Compat src=metadata dst=metadata collision=label + └── ScanSet num_targets=2 predicate[0]=GTE(builtin.timestamp, 2025-01-01T00:00:00Z) predicate[1]=LT(builtin.timestamp, 2025-01-01T01:00:00Z) + ├── @target type=ScanTypeDataObject location=objects/00/0000000000.dataobj streams=5 section_id=1 projections=() + └── @target type=ScanTypeDataObject location=objects/00/0000000000.dataobj streams=5 section_id=0 projections=() `, }, { @@ -283,11 +289,12 @@ Limit offset=0 limit=1000 VectorAggregation operation=sum group_by=(ambiguous.bar) └── RangeAggregation operation=count start=2025-01-01T00:00:00Z end=2025-01-01T01:00:00Z step=0s range=1m0s partition_by=(ambiguous.bar) └── Parallelize - └── Projection all=true expand=(PARSE_LOGFMT(builtin.message, [bar])) - └── Compat src=metadata dst=metadata collision=label - └── ScanSet num_targets=2 projections=(builtin.message, builtin.timestamp) predicate[0]=GTE(builtin.timestamp, 2024-12-31T23:59:00Z) predicate[1]=LT(builtin.timestamp, 2025-01-01T01:00:00Z) - ├── @target type=ScanTypeDataObject location=objects/00/0000000000.dataobj streams=5 section_id=1 projections=() - └── @target type=ScanTypeDataObject location=objects/00/0000000000.dataobj streams=5 section_id=0 projections=() + └── Compat src=parsed dst=parsed collision=label + └── Projection all=true expand=(PARSE_LOGFMT(builtin.message, [bar])) + └── Compat src=metadata dst=metadata collision=label + └── ScanSet num_targets=2 projections=(builtin.message, builtin.timestamp) predicate[0]=GTE(builtin.timestamp, 2024-12-31T23:59:00Z) predicate[1]=LT(builtin.timestamp, 2025-01-01T01:00:00Z) + ├── @target type=ScanTypeDataObject location=objects/00/0000000000.dataobj streams=5 section_id=1 projections=() + └── @target type=ScanTypeDataObject location=objects/00/0000000000.dataobj streams=5 section_id=0 projections=() `, }, } diff --git a/pkg/engine/internal/util/dag/dag.go b/pkg/engine/internal/util/dag/dag.go index 7f4b83962f073..5779c330c435c 100644 --- a/pkg/engine/internal/util/dag/dag.go +++ b/pkg/engine/internal/util/dag/dag.go @@ -84,10 +84,10 @@ func (g *Graph[NodeType]) AddEdge(e Edge[NodeType]) error { return fmt.Errorf("parent and child nodes must not be zero values") } if !g.nodes.Contains(e.Parent) { - return fmt.Errorf("node %s does not exist in graph", e.Parent.ID()) + return fmt.Errorf("parent node %s does not exist in graph", e.Parent.ID()) } if !g.nodes.Contains(e.Child) { - return fmt.Errorf("node %s does not exist in graph", e.Child.ID()) + return fmt.Errorf("child node %s does not exist in graph", e.Child.ID()) } // Uniquely add the edges. 
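For context on the change below: NewScalar materializes a literal value across a fixed number of rows, and its list branch only handles []string today, so this patch turns the previous silent skip for other element types into a panic. One subtlety: value, ok := value.Any().([]string) shadows value, so on the failure path %T always reports []string rather than the original literal's dynamic type; formatting value.Any() captured before the assertion would keep the real type in the message. For reference, here is a standalone sketch of the same ListBuilder pattern written directly against arrow-go; buildStringList and its inputs are made up for illustration.

    package main

    import (
        "fmt"

        "github.com/apache/arrow-go/v18/arrow"
        "github.com/apache/arrow-go/v18/arrow/array"
        "github.com/apache/arrow-go/v18/arrow/memory"
    )

    // buildStringList repeats the same []string value for rows rows,
    // producing a list<string> array, which is the shape NewScalar
    // emits for list literals.
    func buildStringList(vals []string, rows int) arrow.Array {
        b := array.NewListBuilder(memory.DefaultAllocator, arrow.BinaryTypes.String)
        defer b.Release()

        vb := b.ValueBuilder().(*array.StringBuilder)
        for i := 0; i < rows; i++ {
            b.Append(true) // start a new, non-null list entry
            for _, v := range vals {
                vb.Append(v)
            }
        }
        return b.NewArray()
    }

    func main() {
        arr := buildStringList([]string{"level", "status"}, 3)
        defer arr.Release()
        fmt.Println(arr.Len()) // 3 rows, each holding ["level" "status"]
    }
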
From 7e0f44eba29e7a21df71c71f2c2b8a67572f28d2 Mon Sep 17 00:00:00 2001 From: Trevor Whitney Date: Thu, 30 Oct 2025 20:46:25 -0600 Subject: [PATCH 10/12] chore: panic if we see an unexpected type --- pkg/engine/internal/executor/column.go | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/pkg/engine/internal/executor/column.go b/pkg/engine/internal/executor/column.go index d66aed786b106..b3b66381ac199 100644 --- a/pkg/engine/internal/executor/column.go +++ b/pkg/engine/internal/executor/column.go @@ -1,6 +1,7 @@ package executor import ( + "fmt" "slices" "github.com/apache/arrow-go/v18/arrow" @@ -54,13 +55,15 @@ func NewScalar(value types.Literal, rows int) arrow.Array { case *array.ListBuilder: //TODO(twhitney): currently only supporting string list, but we can add more types here as we need them value, ok := value.Any().([]string) - if ok { - valueBuilder := builder.ValueBuilder().(*array.StringBuilder) - for range rows { - builder.Append(true) - for _, val := range value { - valueBuilder.Append(val) - } + if !ok { + panic(fmt.Errorf("unsupported list literal type: %T", value)) + } + + valueBuilder := builder.ValueBuilder().(*array.StringBuilder) + for range rows { + builder.Append(true) + for _, val := range value { + valueBuilder.Append(val) } } } From 4e5ab62c5a8e5e3109852e7026f7f8ce0a0357db Mon Sep 17 00:00:00 2001 From: Trevor Whitney Date: Thu, 30 Oct 2025 21:02:10 -0600 Subject: [PATCH 11/12] fix: change order nodes are added to graph --- .../internal/planner/physical/planner.go | 19 +++++++++---------- pkg/engine/internal/planner/planner_test.go | 2 +- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/pkg/engine/internal/planner/physical/planner.go b/pkg/engine/internal/planner/physical/planner.go index f77efcaddeed6..11f876534dc72 100644 --- a/pkg/engine/internal/planner/physical/planner.go +++ b/pkg/engine/internal/planner/physical/planner.go @@ -300,8 +300,7 @@ func (p *Planner) processProjection(lp *logical.Projection, ctx *Context) (Node, } } - var node Node - node = &Projection{ + var node Node = &Projection{ Expressions: expressions, All: lp.All, Expand: lp.Expand, @@ -309,6 +308,14 @@ func (p *Planner) processProjection(lp *logical.Projection, ctx *Context) (Node, } p.plan.graph.Add(node) + child, err := p.process(lp.Relation, ctx) + if err != nil { + return nil, err + } + if err := p.plan.graph.AddEdge(dag.Edge[Node]{Parent: node, Child: child}); err != nil { + return nil, err + } + if needsCompat && p.context.v1Compatible { compat := &ColumnCompat{ Source: types.ColumnTypeParsed, @@ -322,14 +329,6 @@ func (p *Planner) processProjection(lp *logical.Projection, ctx *Context) (Node, } } - child, err := p.process(lp.Relation, ctx) - if err != nil { - return nil, err - } - if err := p.plan.graph.AddEdge(dag.Edge[Node]{Parent: node, Child: child}); err != nil { - return nil, err - } - return node, nil } diff --git a/pkg/engine/internal/planner/planner_test.go b/pkg/engine/internal/planner/planner_test.go index cf81c8491e456..5107ad4532417 100644 --- a/pkg/engine/internal/planner/planner_test.go +++ b/pkg/engine/internal/planner/planner_test.go @@ -292,7 +292,7 @@ VectorAggregation operation=sum group_by=(ambiguous.bar) └── Compat src=parsed dst=parsed collision=label └── Projection all=true expand=(PARSE_LOGFMT(builtin.message, [bar])) └── Compat src=metadata dst=metadata collision=label - └── ScanSet num_targets=2 projections=(builtin.message, builtin.timestamp) predicate[0]=GTE(builtin.timestamp, 2024-12-31T23:59:00Z) 
predicate[1]=LT(builtin.timestamp, 2025-01-01T01:00:00Z) + └── ScanSet num_targets=2 projections=(ambiguous.bar, builtin.message, builtin.timestamp) predicate[0]=GTE(builtin.timestamp, 2024-12-31T23:59:00Z) predicate[1]=LT(builtin.timestamp, 2025-01-01T01:00:00Z) ├── @target type=ScanTypeDataObject location=objects/00/0000000000.dataobj streams=5 section_id=1 projections=() └── @target type=ScanTypeDataObject location=objects/00/0000000000.dataobj streams=5 section_id=0 projections=() `, From 1178cc2cd6f55b85a01301ab834203050e404294 Mon Sep 17 00:00:00 2001 From: Trevor Whitney Date: Thu, 30 Oct 2025 21:09:47 -0600 Subject: [PATCH 12/12] fix: a few more compat nodes in test --- pkg/engine/internal/planner/physical/planner_test.go | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pkg/engine/internal/planner/physical/planner_test.go b/pkg/engine/internal/planner/physical/planner_test.go index cd0abee45a33c..5c633015e520d 100644 --- a/pkg/engine/internal/planner/physical/planner_test.go +++ b/pkg/engine/internal/planner/physical/planner_test.go @@ -304,8 +304,12 @@ func TestPlanner_Convert_WithParse(t *testing.T) { children := physicalPlan.Children(filterNode) require.Len(t, children, 1) + compatNode, ok := children[0].(*ColumnCompat) + require.True(t, ok, "Filter's child should be ColumnCompat") + children = physicalPlan.Children(compatNode) + projectionNode, ok := children[0].(*Projection) - require.True(t, ok, "Filter's child should be Projection") + require.True(t, ok, "ColumnCompat's child should be Projection") require.Len(t, projectionNode.Expressions, 1) expr, ok := projectionNode.Expressions[0].(*VariadicExpr) @@ -396,8 +400,12 @@ func TestPlanner_Convert_WithParse(t *testing.T) { children = physicalPlan.Children(filterNode) require.Len(t, children, 1) + compatNode, ok := children[0].(*ColumnCompat) + require.True(t, ok, "Filter's child should be ColumnCompat") + children = physicalPlan.Children(compatNode) + projectionNode, ok := children[0].(*Projection) - require.True(t, ok, "Filter's child should be Projection") + require.True(t, ok, "ColumnCompat's child should be Projection") require.Len(t, projectionNode.Expressions, 1) expr, ok := projectionNode.Expressions[0].(*VariadicExpr)
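
Patch 12 updates the planner tests to match patch 11: where a Filter used to sit directly above the Projection that expands PARSE_*, the tests now walk Filter to ColumnCompat to Projection. A toy sketch of the wrap-and-connect shape these two patches settle on; the graph type, wrapWith, and named are stand-ins, since only wrapNodeWith's call sites appear in the diffs above.

    package main

    import "fmt"

    type Node interface{ ID() string }

    type named string

    func (n named) ID() string { return string(n) }

    type graph struct{ children map[string][]Node }

    func (g *graph) addEdge(parent, child Node) {
        g.children[parent.ID()] = append(g.children[parent.ID()], child)
    }

    // wrapWith inserts wrapper directly above node and returns wrapper as
    // the new subtree root. Patch 11 wires the Projection to its child
    // first and wraps afterwards, so the compat node lands above the
    // finished subtree.
    func (g *graph) wrapWith(node, wrapper Node) Node {
        g.addEdge(wrapper, node)
        return wrapper
    }

    func main() {
        g := &graph{children: map[string][]Node{}}
        projection, scan, compat := named("Projection"), named("ScanSet"), named("Compat")

        g.addEdge(projection, scan)            // 1. wire the child first
        root := g.wrapWith(projection, compat) // 2. then wrap with compat

        fmt.Println(root.ID(), "->", g.children[root.ID()][0].ID()) // Compat -> Projection
    }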