Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions tuple/decoder.go
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,12 @@ func (dec *Decoder[S]) Decode(r io.Reader) (*CompactSketch[S], error) {
return nil, err
}

if v, ok := any(summary).(AfterDecodeValidator); ok {
if err := v.ValidateAfterDecode(); err != nil {
return nil, err
}
}

entries[i] = entry[S]{Hash: hash, Summary: summary}
}

Expand Down
6 changes: 6 additions & 0 deletions tuple/encoder.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,12 @@ func (enc *Encoder[S]) Encode(sketch *CompactSketch[S]) error {
}
}
for _, entry := range sketch.entries {
if v, ok := any(entry.Summary).(BeforeEncodeValidator); ok {
if err := v.ValidateBeforeEncode(); err != nil {
return err
}
}

if err := binary.Write(enc.w, binary.LittleEndian, entry.Hash); err != nil {
return err
}
Expand Down
12 changes: 12 additions & 0 deletions tuple/sketch.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,18 @@ type Summary interface {
Clone() Summary
}

// BeforeEncodeValidator is an optional interface that summaries can implement
// to validate their content before encoding.
//
// The encoder type-asserts each entry's summary against this interface and,
// when it is implemented, calls ValidateBeforeEncode before writing that
// entry's hash. A non-nil error is returned from Encode unchanged.
type BeforeEncodeValidator interface {
// ValidateBeforeEncode returns a non-nil error when the summary must not
// be encoded in its current state.
ValidateBeforeEncode() error
}

// AfterDecodeValidator is an optional interface that summaries can implement
// to validate their content after decoding.
//
// The decoder type-asserts each decoded summary against this interface and,
// when it is implemented, calls ValidateAfterDecode before storing the entry.
// A non-nil error makes Decode return a nil sketch together with that error.
type AfterDecodeValidator interface {
// ValidateAfterDecode returns a non-nil error when the decoded summary
// content is not acceptable.
ValidateAfterDecode() error
}

// Sketch is the base interface for tuple sketches.
// It extends Theta sketch to associate arbitrary summaries with each retained key.
type Sketch[S Summary] interface {
Expand Down
95 changes: 95 additions & 0 deletions tuple/sketch_serialization_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,27 @@ import (
"github.com/apache/datasketches-go/theta"
)

// stringSummaryWriter serializes s to w as a little-endian uint32 byte count
// followed by the raw bytes of the summary's string value. The first write
// error encountered is returned as-is.
func stringSummaryWriter(w io.Writer, s *stringSummary) error {
	payload := []byte(s.value)
	size := uint32(len(payload))

	// Length prefix first so the reader knows how many bytes follow.
	if err := binary.Write(w, binary.LittleEndian, size); err != nil {
		return err
	}
	if _, err := w.Write(payload); err != nil {
		return err
	}
	return nil
}

// stringSummaryReader reads a summary previously written in the
// length-prefixed layout: a little-endian uint32 byte count followed by that
// many bytes of string data. It returns a nil summary and the underlying
// error if either the prefix or the full payload cannot be read.
func stringSummaryReader(r io.Reader) (*stringSummary, error) {
	var size uint32
	if err := binary.Read(r, binary.LittleEndian, &size); err != nil {
		return nil, err
	}

	payload := make([]byte, size)
	_, err := io.ReadFull(r, payload)
	if err != nil {
		return nil, err
	}

	summary := new(stringSummary)
	summary.value = string(payload)
	return summary, nil
}

// int32SummaryWriter serializes the summary's int32 value to w in
// little-endian byte order and returns any write error unchanged.
func int32SummaryWriter(w io.Writer, s *int32Summary) error {
	if err := binary.Write(w, binary.LittleEndian, s.value); err != nil {
		return err
	}
	return nil
}
Expand Down Expand Up @@ -443,3 +464,77 @@ func TestDecoderErrors(t *testing.T) {
assert.Contains(t, err.Error(), "sketch type mismatch")
})
}

// TestStringSummaryUTF8Validation exercises the optional validator hooks that
// stringSummary implements: a sketch holding valid UTF-8 must round-trip,
// while invalid UTF-8 must be rejected both when encoding and when decoding.
func TestStringSummaryUTF8Validation(t *testing.T) {
	t.Run("Valid UTF-8 round-trip succeeds", func(t *testing.T) {
		updatable, err := NewUpdateSketch[*stringSummary, string](newStringSummary)
		assert.NoError(t, err)
		assert.NoError(t, updatable.UpdateString("key1", "hello"))
		assert.NoError(t, updatable.UpdateString("key2", "안녕하세요"))

		original, err := updatable.Compact(true)
		assert.NoError(t, err)

		var wire bytes.Buffer
		enc := NewEncoder[*stringSummary](&wire, stringSummaryWriter)
		assert.NoError(t, enc.Encode(original))

		restored, err := Decode[*stringSummary](wire.Bytes(), theta.DefaultSeed, stringSummaryReader)
		assert.NoError(t, err)
		assert.Equal(t, original.NumRetained(), restored.NumRetained())
	})

	t.Run("Invalid UTF-8 encode fails", func(t *testing.T) {
		updatable, err := NewUpdateSketch[*stringSummary, string](newStringSummary)
		assert.NoError(t, err)
		assert.NoError(t, updatable.UpdateString("key1", "valid"))

		tampered, err := updatable.Compact(true)
		assert.NoError(t, err)

		// Overwrite every retained summary with a byte sequence that is not
		// valid UTF-8 so the pre-encode validation has something to reject.
		for idx := range tampered.entries {
			tampered.entries[idx].Summary = &stringSummary{value: "bad\xff\xfe"}
		}

		var wire bytes.Buffer
		enc := NewEncoder[*stringSummary](&wire, stringSummaryWriter)
		encodeErr := enc.Encode(tampered)
		assert.Error(t, encodeErr)
		assert.Contains(t, encodeErr.Error(), "invalid UTF-8 string")
	})

	t.Run("Invalid UTF-8 decode fails", func(t *testing.T) {
		updatable, err := NewUpdateSketch[*stringSummary, string](newStringSummary)
		assert.NoError(t, err)
		assert.NoError(t, updatable.UpdateString("key1", "hi"))

		original, err := updatable.Compact(true)
		assert.NoError(t, err)

		var wire bytes.Buffer
		enc := NewEncoder[*stringSummary](&wire, stringSummaryWriter)
		assert.NoError(t, enc.Encode(original))

		// The encoded bytes are valid; the reader below consumes them
		// normally and then swaps in invalid UTF-8, so the failure comes from
		// the post-decode validation rather than from a malformed stream.
		corruptingReader := func(r io.Reader) (*stringSummary, error) {
			summary, readErr := stringSummaryReader(r)
			if readErr != nil {
				return nil, readErr
			}
			summary.value = "bad\xff\xfe"
			return summary, nil
		}

		_, decodeErr := Decode[*stringSummary](wire.Bytes(), theta.DefaultSeed, corruptingReader)
		assert.Error(t, decodeErr)
		assert.Contains(t, decodeErr.Error(), "invalid UTF-8 string")
	})
}
39 changes: 39 additions & 0 deletions tuple/testing.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@

package tuple

import (
"fmt"
"unicode/utf8"
)

// int32Summary is a summary type that carries a single int32 value.
type int32Summary struct {
// value is the int32 payload held by this summary.
value int32
}
Expand Down Expand Up @@ -70,3 +75,37 @@ func (s int32ValueSummary) Update(value int32) {}
// newInt32ValueSummary returns an empty int32ValueSummary value.
func newInt32ValueSummary() int32ValueSummary {
	summary := int32ValueSummary{}
	return summary
}

// stringSummary is a summary type that carries a single string value. Its
// ValidateBeforeEncode and ValidateAfterDecode methods reject values that are
// not valid UTF-8.
type stringSummary struct {
// value is the string held by this summary; Update overwrites it and
// Reset clears it to the empty string.
value string
}

// Reset clears the summary so that it holds the empty string.
func (s *stringSummary) Reset() {
	var empty string
	s.value = empty
}

// Clone returns a newly allocated *stringSummary holding the same string
// value as s, so later changes to either one do not affect the other.
func (s *stringSummary) Clone() Summary {
	dup := new(stringSummary)
	dup.value = s.value
	return dup
}

// Update replaces the stored string with value; the previous content is
// discarded rather than merged.
func (s *stringSummary) Update(value string) {
	latest := value
	s.value = latest
}

// ValidateBeforeEncode returns nil when the stored string is valid UTF-8 and
// an "invalid UTF-8 string" error otherwise.
func (s *stringSummary) ValidateBeforeEncode() error {
	if utf8.ValidString(s.value) {
		return nil
	}
	return fmt.Errorf("invalid UTF-8 string")
}

// ValidateAfterDecode checks the decoded string and returns an
// "invalid UTF-8 string" error when it is not valid UTF-8, or nil otherwise.
func (s *stringSummary) ValidateAfterDecode() error {
	var err error
	if valid := utf8.ValidString(s.value); !valid {
		err = fmt.Errorf("invalid UTF-8 string")
	}
	return err
}

// newStringSummary allocates a fresh stringSummary whose value is the empty
// string.
func newStringSummary() *stringSummary {
	return new(stringSummary)
}
Loading