From 46a2b7e740ca2a9484863d618ccc02a79b80591c Mon Sep 17 00:00:00 2001
From: urasmutlu
Date: Wed, 11 Feb 2026 00:50:01 +0000
Subject: [PATCH] feat(gmail): add --safe flag to get and thread get commands
Sanitize email output by stripping HTML via a full parser, removing
URLs to prevent phishing/tracking, and decoding HTML entities. In JSON
mode a sanitized bodies map is provided and raw body data is cleared.
Co-Authored-By: Claude Opus 4.6
---
README.md | 8 +
internal/cmd/gmail_get.go | 48 +++--
internal/cmd/gmail_get_cmd_test.go | 156 +++++++++++++++++
internal/cmd/gmail_thread.go | 150 ++++++++++++++--
internal/cmd/gmail_thread_run_test.go | 100 +++++++++++
internal/cmd/gmail_thread_test.go | 241 ++++++++++++++++++++++++++
6 files changed, 681 insertions(+), 22 deletions(-)
diff --git a/README.md b/README.md
index d06683b2..3169acb1 100644
--- a/README.md
+++ b/README.md
@@ -537,8 +537,10 @@ gog gmail search 'newer_than:7d' --max 10
gog gmail thread get
gog gmail thread get --download # Download attachments to current dir
gog gmail thread get --download --out-dir ./attachments
+gog gmail thread get --safe # Safe mode (see below)
gog gmail get
gog gmail get --format metadata
+gog gmail get --safe # Safe mode (see below)
gog gmail attachment
gog gmail attachment --out ./attachment.bin
gog gmail url # Print Gmail web URL
@@ -595,6 +597,12 @@ gog gmail watch serve --bind 0.0.0.0 --verify-oidc --oidc-email --hook
gog gmail history --since
```
+Safe mode (`--safe`):
+- Strips all HTML using an HTML tokenizer (not regex), removing scripts, styles, and tags
+- Replaces `http://` and `https://` URLs with `[url removed]` to prevent phishing and tracking
+- Decodes HTML entities to catch obfuscated URLs
+- In JSON mode, provides a sanitized `bodies` map and clears raw body data from the payload
+
Gmail watch (Pub/Sub push):
- Create Pub/Sub topic + push subscription (OIDC preferred; shared token ok for dev).
- Full flow + payload details: `docs/watch.md`.
diff --git a/internal/cmd/gmail_get.go b/internal/cmd/gmail_get.go
index 2c3625d8..1f86d92a 100644
--- a/internal/cmd/gmail_get.go
+++ b/internal/cmd/gmail_get.go
@@ -15,6 +15,7 @@ type GmailGetCmd struct {
MessageID string `arg:"" name:"messageId" help:"Message ID"`
Format string `name:"format" help:"Message format: full|metadata|raw" default:"full"`
Headers string `name:"headers" help:"Metadata headers (comma-separated; only for --format=metadata)"`
+ Safe bool `name:"safe" help:"Sanitize output: strip HTML, remove URLs, decode entities"`
}
const (
@@ -78,18 +79,31 @@ func (c *GmailGetCmd) Run(ctx context.Context, flags *RootFlags) error {
"subject": headerValue(msg.Payload, "Subject"),
"date": headerValue(msg.Payload, "Date"),
}
+ if c.Safe {
+ for k, v := range headers {
+ headers[k] = sanitizeText(v)
+ }
+ }
payload := map[string]any{
"message": msg,
"headers": headers,
}
- if unsubscribe != "" {
+ if unsubscribe != "" && !c.Safe {
payload["unsubscribe"] = unsubscribe
}
if format == gmailFormatFull {
- if body := bestBodyText(msg.Payload); body != "" {
+ if c.Safe {
+ safeBody, isHTML := bestBodyForDisplay(msg.Payload)
+ if safeBody != "" {
+ payload["body"] = sanitizeBodyText(safeBody, isHTML)
+ }
+ } else if body := bestBodyText(msg.Payload); body != "" {
payload["body"] = body
}
}
+ if c.Safe {
+ clearPayloadBodies(msg.Payload)
+ }
if format == gmailFormatFull || format == gmailFormatMetadata {
attachments := collectAttachments(msg.Payload)
if len(attachments) > 0 {
@@ -117,11 +131,17 @@ func (c *GmailGetCmd) Run(ctx context.Context, flags *RootFlags) error {
u.Out().Println(string(decoded))
return nil
case gmailFormatMetadata, gmailFormatFull:
- u.Out().Printf("from\t%s", headerValue(msg.Payload, "From"))
- u.Out().Printf("to\t%s", headerValue(msg.Payload, "To"))
- u.Out().Printf("subject\t%s", headerValue(msg.Payload, "Subject"))
+ if c.Safe {
+ u.Out().Printf("from\t%s", sanitizeText(headerValue(msg.Payload, "From")))
+ u.Out().Printf("to\t%s", sanitizeText(headerValue(msg.Payload, "To")))
+ u.Out().Printf("subject\t%s", sanitizeText(headerValue(msg.Payload, "Subject")))
+ } else {
+ u.Out().Printf("from\t%s", headerValue(msg.Payload, "From"))
+ u.Out().Printf("to\t%s", headerValue(msg.Payload, "To"))
+ u.Out().Printf("subject\t%s", headerValue(msg.Payload, "Subject"))
+ }
u.Out().Printf("date\t%s", headerValue(msg.Payload, "Date"))
- if unsubscribe != "" {
+ if unsubscribe != "" && !c.Safe {
u.Out().Printf("unsubscribe\t%s", unsubscribe)
}
attachments := attachmentOutputs(collectAttachments(msg.Payload))
@@ -130,10 +150,18 @@ func (c *GmailGetCmd) Run(ctx context.Context, flags *RootFlags) error {
printAttachmentLines(u.Out(), attachments)
}
if format == gmailFormatFull {
- body := bestBodyText(msg.Payload)
- if body != "" {
- u.Out().Println("")
- u.Out().Println(body)
+ if c.Safe {
+ body, isHTML := bestBodyForDisplay(msg.Payload)
+ if body != "" {
+ u.Out().Println("")
+ u.Out().Println(sanitizeBodyText(body, isHTML))
+ }
+ } else {
+ body := bestBodyText(msg.Payload)
+ if body != "" {
+ u.Out().Println("")
+ u.Out().Println(body)
+ }
}
}
return nil
diff --git a/internal/cmd/gmail_get_cmd_test.go b/internal/cmd/gmail_get_cmd_test.go
index f30de232..66a04cea 100644
--- a/internal/cmd/gmail_get_cmd_test.go
+++ b/internal/cmd/gmail_get_cmd_test.go
@@ -474,3 +474,159 @@ func TestGmailGetCmd_RawEmpty(t *testing.T) {
t.Fatalf("unexpected stderr: %q", errOut)
}
}
+
+// TestGmailGetCmd_Safe_JSON runs `gmail get m1 --format full --safe` in JSON
+// mode against a stubbed Gmail endpoint and checks the sanitized output: the
+// body keeps its text but contains no "https://", no "unsubscribe" key is
+// emitted, and the subject header has its URL replaced by "[url removed]".
+func TestGmailGetCmd_Safe_JSON(t *testing.T) {
+ origNew := newGmailService
+ t.Cleanup(func() { newGmailService = origNew })
+
+ htmlBody := base64.RawURLEncoding.EncodeToString([]byte(
+ `Hello https://phish.com/steal
`,
+ ))
+ // Stub Gmail API: every messages/ path answers with one text/html message.
+ srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ if !strings.Contains(r.URL.Path, "/gmail/v1/users/me/messages/") {
+ http.NotFound(w, r)
+ return
+ }
+ w.Header().Set("Content-Type", "application/json")
+ _ = json.NewEncoder(w).Encode(map[string]any{
+ "id": "m1",
+ "threadId": "t1",
+ "labelIds": []string{"INBOX"},
+ "payload": map[string]any{
+ "mimeType": "text/html",
+ "body": map[string]any{"data": htmlBody},
+ "headers": []map[string]any{
+ {"name": "From", "value": "a@example.com"},
+ {"name": "To", "value": "b@example.com"},
+ {"name": "Subject", "value": "Visit https://evil.com now"},
+ {"name": "Date", "value": "Fri, 26 Dec 2025 10:00:00 +0000"},
+ // NOTE(review): this value is empty, so the unsubscribe assertion
+ // below may pass even without --safe; confirm the fixture carries a link.
+ {"name": "List-Unsubscribe", "value": ""},
+ },
+ },
+ })
+ }))
+ defer srv.Close()
+
+ svc, err := gmail.NewService(context.Background(),
+ option.WithoutAuthentication(),
+ option.WithHTTPClient(srv.Client()),
+ option.WithEndpoint(srv.URL+"/"),
+ )
+ if err != nil {
+ t.Fatalf("NewService: %v", err)
+ }
+ // Route the command's Gmail client to the stub server.
+ newGmailService = func(context.Context, string) (*gmail.Service, error) { return svc, nil }
+
+ flags := &RootFlags{Account: "a@b.com"}
+ out := captureStdout(t, func() {
+ _ = captureStderr(t, func() {
+ u, uiErr := ui.New(ui.Options{Stdout: io.Discard, Stderr: io.Discard, Color: "never"})
+ if uiErr != nil {
+ t.Fatalf("ui.New: %v", uiErr)
+ }
+ ctx := ui.WithUI(context.Background(), u)
+ ctx = outfmt.WithMode(ctx, outfmt.Mode{JSON: true})
+
+ cmd := &GmailGetCmd{Safe: true}
+ if err := runKong(t, cmd, []string{"m1", "--format", "full", "--safe"}, ctx, flags); err != nil {
+ t.Fatalf("execute: %v", err)
+ }
+ })
+ })
+
+ var parsed map[string]any
+ if err := json.Unmarshal([]byte(out), &parsed); err != nil {
+ t.Fatalf("json parse: %v", err)
+ }
+
+ // Body should be sanitized: text kept, URL gone.
+ body, _ := parsed["body"].(string)
+ if strings.Contains(body, "https://") {
+ t.Fatalf("--safe body should not contain URLs, got: %q", body)
+ }
+ if !strings.Contains(body, "Hello") {
+ t.Fatalf("--safe body should contain 'Hello', got: %q", body)
+ }
+
+ // Unsubscribe should not be present in safe mode.
+ if _, ok := parsed["unsubscribe"]; ok {
+ t.Fatalf("--safe JSON should not include unsubscribe link")
+ }
+
+ // Headers should be sanitized: the subject URL becomes the placeholder.
+ headers, _ := parsed["headers"].(map[string]any)
+ subject, _ := headers["subject"].(string)
+ if strings.Contains(subject, "https://") {
+ t.Fatalf("--safe subject should not contain URLs, got: %q", subject)
+ }
+ if !strings.Contains(subject, "[url removed]") {
+ t.Fatalf("--safe subject should contain [url removed], got: %q", subject)
+ }
+}
+
+// TestGmailGetCmd_Safe_Text runs `gmail get m1 --format full --safe` in text
+// mode against a stubbed Gmail endpoint serving a text/plain message, and
+// checks that the captured output contains "[url removed]" but neither
+// "https://" nor the word "unsubscribe".
+func TestGmailGetCmd_Safe_Text(t *testing.T) {
+ origNew := newGmailService
+ t.Cleanup(func() { newGmailService = origNew })
+
+ bodyData := base64.RawURLEncoding.EncodeToString([]byte("Hello visit https://phish.com/login for details"))
+ // Stub Gmail API: every messages/ path answers with one text/plain message.
+ srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ if !strings.Contains(r.URL.Path, "/gmail/v1/users/me/messages/") {
+ http.NotFound(w, r)
+ return
+ }
+ w.Header().Set("Content-Type", "application/json")
+ _ = json.NewEncoder(w).Encode(map[string]any{
+ "id": "m1",
+ "threadId": "t1",
+ "labelIds": []string{"INBOX"},
+ "payload": map[string]any{
+ "mimeType": "text/plain",
+ "body": map[string]any{"data": bodyData},
+ "headers": []map[string]any{
+ {"name": "From", "value": "a@example.com"},
+ {"name": "To", "value": "b@example.com"},
+ {"name": "Subject", "value": "Urgent https://evil.com action"},
+ {"name": "Date", "value": "Fri, 26 Dec 2025 10:00:00 +0000"},
+ // NOTE(review): this value is empty, so the "unsubscribe" assertion
+ // below may pass even without --safe; confirm the fixture carries a link.
+ {"name": "List-Unsubscribe", "value": ""},
+ },
+ },
+ })
+ }))
+ defer srv.Close()
+
+ svc, err := gmail.NewService(context.Background(),
+ option.WithoutAuthentication(),
+ option.WithHTTPClient(srv.Client()),
+ option.WithEndpoint(srv.URL+"/"),
+ )
+ if err != nil {
+ t.Fatalf("NewService: %v", err)
+ }
+ // Route the command's Gmail client to the stub server.
+ newGmailService = func(context.Context, string) (*gmail.Service, error) { return svc, nil }
+
+ flags := &RootFlags{Account: "a@b.com"}
+ out := captureStdout(t, func() {
+ _ = captureStderr(t, func() {
+ u, uiErr := ui.New(ui.Options{Stdout: os.Stdout, Stderr: io.Discard, Color: "never"})
+ if uiErr != nil {
+ t.Fatalf("ui.New: %v", uiErr)
+ }
+ ctx := ui.WithUI(context.Background(), u)
+
+ cmd := &GmailGetCmd{Safe: true}
+ if err := runKong(t, cmd, []string{"m1", "--format", "full", "--safe"}, ctx, flags); err != nil {
+ t.Fatalf("execute: %v", err)
+ }
+ })
+ })
+
+ // The whole text output (headers and body) is checked at once.
+ if strings.Contains(out, "https://") {
+ t.Fatalf("--safe text output should not contain URLs, got: %q", out)
+ }
+ if !strings.Contains(out, "[url removed]") {
+ t.Fatalf("--safe text output should contain [url removed], got: %q", out)
+ }
+ if strings.Contains(out, "unsubscribe") {
+ t.Fatalf("--safe text output should not show unsubscribe link, got: %q", out)
+ }
+}
diff --git a/internal/cmd/gmail_thread.go b/internal/cmd/gmail_thread.go
index 52ccac96..97a3a2fd 100644
--- a/internal/cmd/gmail_thread.go
+++ b/internal/cmd/gmail_thread.go
@@ -6,6 +6,7 @@ import (
"encoding/base64"
"errors"
"fmt"
+ htmlpkg "html"
"io"
"mime"
"mime/quotedprintable"
@@ -15,6 +16,7 @@ import (
"regexp"
"strings"
+ "golang.org/x/net/html"
"golang.org/x/net/html/charset"
"google.golang.org/api/gmail/v1"
@@ -46,6 +48,102 @@ func stripHTMLTags(s string) string {
return strings.TrimSpace(s)
}
+// --safe mode sanitization functions.
+
+// urlPattern matches HTTP and HTTPS URLs. The scheme is matched
+// case-insensitively (URI schemes are case-insensitive), so "HTTPS://..." and
+// "Http://..." are caught as well as the lowercase forms. A match extends
+// until whitespace or one of the characters < > " ' ` ] ).
+var urlPattern = regexp.MustCompile(`(?i)https?://[^\s<>"'` + "`" + `\]\)]+`)
+
+// blockElements is the set of HTML elements whose start and end tags are
+// turned into a space during text extraction, so that text from neighbouring
+// elements does not run together. Besides elements that produce visual line
+// breaks it includes table cells (td, th): cells do not break lines, but the
+// text of adjacent cells must not be fused into one word.
+var blockElements = map[string]bool{
+	"div": true, "p": true, "br": true, "li": true, "tr": true,
+	"h1": true, "h2": true, "h3": true, "h4": true, "h5": true, "h6": true,
+	"blockquote": true, "pre": true, "hr": true, "table": true,
+	"td": true, "th": true,
+	"ul": true, "ol": true, "dl": true, "dt": true, "dd": true,
+	"section": true, "article": true, "header": true, "footer": true,
+}
+
+// safeExtractTextFromHTML returns the text content of an HTML document for
+// --safe mode. It walks the input with the golang.org/x/net/html tokenizer
+// instead of regular expressions, drops everything between <script>/<style>
+// tags, inserts a space at each blockElements tag, and finally collapses
+// whitespace runs and trims the result.
+func safeExtractTextFromHTML(s string) string {
+	var out strings.Builder
+	z := html.NewTokenizer(strings.NewReader(s))
+	suppressed := false // true while inside a script or style element
+	for {
+		kind := z.Next()
+		if kind == html.ErrorToken {
+			// End of input (or a tokenizer error): stop and emit what we have.
+			break
+		}
+		if kind == html.TextToken {
+			if !suppressed {
+				out.Write(z.Text())
+			}
+			continue
+		}
+		opening := kind == html.StartTagToken || kind == html.SelfClosingTagToken
+		if !opening && kind != html.EndTagToken {
+			// Comments, doctypes and the like contribute no text.
+			continue
+		}
+		raw, _ := z.TagName()
+		name := string(raw)
+		if name == "script" || name == "style" {
+			// Opening tags start suppression, the closing tag ends it.
+			suppressed = opening
+		}
+		if blockElements[name] {
+			out.WriteByte(' ')
+		}
+	}
+	return strings.TrimSpace(whitespacePattern.ReplaceAllString(out.String(), " "))
+}
+
+// stripURLs replaces all HTTP/HTTPS URLs with [url removed].
+func stripURLs(s string) string {
+ return urlPattern.ReplaceAllString(s, "[url removed]")
+}
+
+// sanitizeBodyText prepares an email body for --safe display. HTML bodies are
+// first reduced to text; the text then has HTML entities decoded, URLs
+// replaced by the placeholder, whitespace runs collapsed and the ends trimmed.
+// An empty body yields an empty string.
+func sanitizeBodyText(body string, isHTML bool) string {
+	if len(body) == 0 {
+		return ""
+	}
+	if isHTML {
+		body = safeExtractTextFromHTML(body)
+	}
+	// Decode entities before matching URLs so entity-encoded URLs are caught too.
+	cleaned := stripURLs(htmlpkg.UnescapeString(body))
+	return strings.TrimSpace(whitespacePattern.ReplaceAllString(cleaned, " "))
+}
+
+// sanitizeText is the lightweight sanitizer used for header values and other
+// short text: it decodes HTML entities and then replaces URLs with the
+// placeholder. Unlike sanitizeBodyText it leaves whitespace untouched.
+func sanitizeText(s string) string {
+	decoded := htmlpkg.UnescapeString(s)
+	return stripURLs(decoded)
+}
+
+// clearPayloadBodies walks the MIME tree rooted at p and empties Body.Data on
+// every part whose MIME type starts with "text/" (compared case-insensitively),
+// so the raw content does not leak into JSON output. Other parts are left
+// untouched; a nil part is ignored.
+func clearPayloadBodies(p *gmail.MessagePart) {
+	if p == nil {
+		return
+	}
+	if body := p.Body; body != nil && strings.HasPrefix(strings.ToLower(p.MimeType), "text/") {
+		body.Data = ""
+	}
+	for i := range p.Parts {
+		clearPayloadBodies(p.Parts[i])
+	}
+}
+
type GmailThreadCmd struct {
Get GmailThreadGetCmd `cmd:"" name:"get" default:"withargs" help:"Get a thread with all messages (optionally download attachments)"`
Modify GmailThreadModifyCmd `cmd:"" name:"modify" help:"Modify labels on all messages in a thread"`
@@ -56,6 +154,7 @@ type GmailThreadGetCmd struct {
ThreadID string `arg:"" name:"threadId" help:"Thread ID"`
Download bool `name:"download" help:"Download attachments"`
Full bool `name:"full" help:"Show full message bodies"`
+ Safe bool `name:"safe" help:"Sanitize output: strip HTML, remove URLs, decode entities"`
OutputDir OutputDirFlag `embed:""`
}
@@ -108,6 +207,22 @@ func (c *GmailThreadGetCmd) Run(ctx context.Context, flags *RootFlags) error {
downloadedFiles = append(downloadedFiles, attachmentDownloadSummaries(downloads)...)
}
}
+ if c.Safe && thread != nil {
+ bodies := make(map[string]string, len(thread.Messages))
+ for _, msg := range thread.Messages {
+ if msg == nil || msg.Id == "" {
+ continue
+ }
+ body, isHTML := bestBodyForDisplay(msg.Payload)
+ bodies[msg.Id] = sanitizeBodyText(body, isHTML)
+ clearPayloadBodies(msg.Payload)
+ }
+ return outfmt.WriteJSON(os.Stdout, map[string]any{
+ "thread": thread,
+ "bodies": bodies,
+ "downloaded": downloadedFiles,
+ })
+ }
return outfmt.WriteJSON(os.Stdout, map[string]any{
"thread": thread,
"downloaded": downloadedFiles,
@@ -127,18 +242,29 @@ func (c *GmailThreadGetCmd) Run(ctx context.Context, flags *RootFlags) error {
continue
}
u.Out().Printf("=== Message %d/%d: %s ===", i+1, len(thread.Messages), msg.Id)
- u.Out().Printf("From: %s", headerValue(msg.Payload, "From"))
- u.Out().Printf("To: %s", headerValue(msg.Payload, "To"))
- u.Out().Printf("Subject: %s", headerValue(msg.Payload, "Subject"))
+ if c.Safe {
+ u.Out().Printf("From: %s", sanitizeText(headerValue(msg.Payload, "From")))
+ u.Out().Printf("To: %s", sanitizeText(headerValue(msg.Payload, "To")))
+ u.Out().Printf("Subject: %s", sanitizeText(headerValue(msg.Payload, "Subject")))
+ } else {
+ u.Out().Printf("From: %s", headerValue(msg.Payload, "From"))
+ u.Out().Printf("To: %s", headerValue(msg.Payload, "To"))
+ u.Out().Printf("Subject: %s", headerValue(msg.Payload, "Subject"))
+ }
u.Out().Printf("Date: %s", headerValue(msg.Payload, "Date"))
u.Out().Println("")
body, isHTML := bestBodyForDisplay(msg.Payload)
if body != "" {
- cleanBody := body
- if isHTML {
- // Strip HTML tags for cleaner text output
- cleanBody = stripHTMLTags(body)
+ var cleanBody string
+ if c.Safe {
+ cleanBody = sanitizeBodyText(body, isHTML)
+ } else {
+ cleanBody = body
+ if isHTML {
+ // Strip HTML tags for cleaner text output
+ cleanBody = stripHTMLTags(body)
+ }
}
// Limit body preview to avoid overwhelming output
// Use runes to avoid breaking multi-byte UTF-8 characters
@@ -361,8 +487,8 @@ func bestBodyText(p *gmail.MessagePart) string {
if plain != "" {
return plain
}
- html := findPartBody(p, "text/html")
- return html
+ htmlBody := findPartBody(p, "text/html")
+ return htmlBody
}
func bestBodyForDisplay(p *gmail.MessagePart) (string, bool) {
@@ -376,11 +502,11 @@ func bestBodyForDisplay(p *gmail.MessagePart) (string, bool) {
}
return plain, false
}
- html := findPartBody(p, "text/html")
- if html == "" {
+ htmlBody := findPartBody(p, "text/html")
+ if htmlBody == "" {
return "", false
}
- return html, true
+ return htmlBody, true
}
func findPartBody(p *gmail.MessagePart, mimeType string) string {
diff --git a/internal/cmd/gmail_thread_run_test.go b/internal/cmd/gmail_thread_run_test.go
index 648f6da2..8e686ca9 100644
--- a/internal/cmd/gmail_thread_run_test.go
+++ b/internal/cmd/gmail_thread_run_test.go
@@ -248,3 +248,103 @@ func TestGmailThreadGetAndAttachments_JSON(t *testing.T) {
t.Fatalf("unexpected empty attachments output: %q", emptyAttachOut)
}
}
+
+func TestGmailThreadGet_Safe(t *testing.T) {
+ origNew := newGmailService
+ t.Cleanup(func() { newGmailService = origNew })
+
+ htmlBody := base64.RawURLEncoding.EncodeToString([]byte(
+ `Hello visit https://phish.com/login now
`,
+ ))
+ threadResp := map[string]any{
+ "id": "t1",
+ "messages": []map[string]any{
+ {
+ "id": "m1",
+ "payload": map[string]any{
+ "headers": []map[string]any{
+ {"name": "From", "value": "a@example.com"},
+ {"name": "To", "value": "b@example.com"},
+ {"name": "Subject", "value": "Check https://evil.com now"},
+ {"name": "Date", "value": "Mon, 1 Jan 2025 00:00:00 +0000"},
+ },
+ "mimeType": "text/html",
+ "body": map[string]any{
+ "data": htmlBody,
+ },
+ },
+ },
+ },
+ }
+
+ srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ path := strings.TrimPrefix(r.URL.Path, "/gmail/v1")
+ if r.Method == http.MethodGet && path == "/users/me/threads/t1" {
+ w.Header().Set("Content-Type", "application/json")
+ _ = json.NewEncoder(w).Encode(threadResp)
+ return
+ }
+ http.NotFound(w, r)
+ }))
+ defer srv.Close()
+
+ svc, err := gmail.NewService(context.Background(),
+ option.WithoutAuthentication(),
+ option.WithHTTPClient(srv.Client()),
+ option.WithEndpoint(srv.URL+"/"),
+ )
+ if err != nil {
+ t.Fatalf("NewService: %v", err)
+ }
+ newGmailService = func(context.Context, string) (*gmail.Service, error) { return svc, nil }
+
+ // Test text output with --safe
+ textOut := captureStdout(t, func() {
+ _ = captureStderr(t, func() {
+ if err := Execute([]string{"--account", "a@b.com", "gmail", "thread", "get", "t1", "--safe"}); err != nil {
+ t.Fatalf("Execute --safe text: %v", err)
+ }
+ })
+ })
+ if strings.Contains(textOut, "https://") {
+ t.Fatalf("--safe text output should not contain URLs, got: %q", textOut)
+ }
+ if !strings.Contains(textOut, "[url removed]") {
+ t.Fatalf("--safe text output should contain [url removed], got: %q", textOut)
+ }
+ if !strings.Contains(textOut, "Hello") {
+ t.Fatalf("--safe text output should contain plain text 'Hello', got: %q", textOut)
+ }
+ if strings.Contains(textOut, "safe text",
+ want: "safe text",
+ },
+ {
+ name: "style block removed",
+ input: "visible",
+ want: "visible",
+ },
+ {
+ name: "nested tags",
+ input: "inner
",
+ want: "inner",
+ },
+ {
+ name: "block elements add spaces",
+ input: "first
second
",
+ want: "first second",
+ },
+ {
+ name: "malformed HTML consumed safely",
+ input: `a & b
",
+ want: "a & b",
+ },
+ {
+ name: "complex email HTML",
+ input: `Hello
World
`,
+ want: "Hello World",
+ },
+ {
+ name: "empty input",
+ input: "",
+ want: "",
+ },
+ {
+ name: "plain text unchanged",
+ input: "no tags here",
+ want: "no tags here",
+ },
+ {
+ name: "self closing tags",
+ input: "line1
line2",
+ want: "line1 line2",
+ },
+ }
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ got := safeExtractTextFromHTML(tt.input)
+ if got != tt.want {
+ t.Errorf("safeExtractTextFromHTML(%q) = %q, want %q", tt.input, got, tt.want)
+ }
+ })
+ }
+}
+
+func TestStripURLs(t *testing.T) {
+ tests := []struct {
+ name string
+ input string
+ want string
+ }{
+ {
+ name: "http URL",
+ input: "visit http://example.com for info",
+ want: "visit [url removed] for info",
+ },
+ {
+ name: "https URL",
+ input: "click https://example.com/page",
+ want: "click [url removed]",
+ },
+ {
+ name: "URL with query params",
+ input: "track https://track.example.com/open?id=abc123&utm_source=email here",
+ want: "track [url removed] here",
+ },
+ {
+ name: "multiple URLs",
+ input: "see https://a.com and http://b.com ok",
+ want: "see [url removed] and [url removed] ok",
+ },
+ {
+ name: "no URLs unchanged",
+ input: "plain text with no links",
+ want: "plain text with no links",
+ },
+ {
+ name: "empty string",
+ input: "",
+ want: "",
+ },
+ {
+ name: "URL at start",
+ input: "https://evil.com/phish is bad",
+ want: "[url removed] is bad",
+ },
+ }
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ got := stripURLs(tt.input)
+ if got != tt.want {
+ t.Errorf("stripURLs(%q) = %q, want %q", tt.input, got, tt.want)
+ }
+ })
+ }
+}
+
+// TestSanitizeBodyText is a table test for sanitizeBodyText covering HTML and
+// plain-text bodies, with and without URLs, plus the empty body.
+func TestSanitizeBodyText(t *testing.T) {
+ tests := []struct {
+ name string
+ body string
+ isHTML bool
+ want string
+ }{
+ {
+ name: "HTML with URL",
+ body: `Click here now
`,
+ isHTML: true,
+ want: "Click here now",
+ },
+ {
+ name: "plain text with URL",
+ body: "Visit https://example.com for details",
+ isHTML: false,
+ want: "Visit [url removed] for details",
+ },
+ {
+ name: "HTML entities decoded then URL stripped",
+ body: "check https://evil.com/payload here",
+ isHTML: false,
+ want: "check [url removed] here",
+ },
+ {
+ name: "HTML with script and tracking",
+ body: `Hello https://phish.com
`,
+ isHTML: true,
+ want: "Hello [url removed]",
+ },
+ {
+ name: "empty body",
+ body: "",
+ isHTML: false,
+ want: "",
+ },
+ {
+ name: "plain text no URLs",
+ body: "Just a normal message",
+ isHTML: false,
+ want: "Just a normal message",
+ },
+ }
+ // Each case runs as its own subtest so failures name the offending input.
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ got := sanitizeBodyText(tt.body, tt.isHTML)
+ if got != tt.want {
+ t.Errorf("sanitizeBodyText(%q, %v) = %q, want %q", tt.body, tt.isHTML, got, tt.want)
+ }
+ })
+ }
+}
+
+func TestSanitizeText(t *testing.T) {
+ tests := []struct {
+ name string
+ input string
+ want string
+ }{
+ {
+ name: "URL in subject",
+ input: "Check https://evil.com now",
+ want: "Check [url removed] now",
+ },
+ {
+ name: "HTML entity decoded",
+ input: "a & b",
+ want: "a & b",
+ },
+ {
+ name: "no changes needed",
+ input: "Normal Subject Line",
+ want: "Normal Subject Line",
+ },
+ }
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ got := sanitizeText(tt.input)
+ if got != tt.want {
+ t.Errorf("sanitizeText(%q) = %q, want %q", tt.input, got, tt.want)
+ }
+ })
+ }
+}
+
+// TestClearPayloadBodies builds a multipart message with text/plain, text/html
+// and image/png parts and checks that clearPayloadBodies empties the data of
+// the two text parts while leaving the image part's data intact.
+func TestClearPayloadBodies(t *testing.T) {
+	plainPart := &gmail.MessagePart{
+		MimeType: "text/plain",
+		Body:     &gmail.MessagePartBody{Data: "c29tZSBkYXRh"},
+	}
+	htmlPart := &gmail.MessagePart{
+		MimeType: "text/html",
+		Body:     &gmail.MessagePartBody{Data: "PHA-aHRtbDwvcD4"},
+	}
+	imagePart := &gmail.MessagePart{
+		MimeType: "image/png",
+		Body:     &gmail.MessagePartBody{Data: "imagedata", AttachmentId: "att1"},
+	}
+	root := &gmail.MessagePart{
+		MimeType: "multipart/alternative",
+		Parts:    []*gmail.MessagePart{plainPart, htmlPart, imagePart},
+	}
+
+	clearPayloadBodies(root)
+
+	if got := root.Parts[0].Body.Data; got != "" {
+		t.Errorf("text/plain body should be cleared, got %q", got)
+	}
+	if got := root.Parts[1].Body.Data; got != "" {
+		t.Errorf("text/html body should be cleared, got %q", got)
+	}
+	if got := root.Parts[2].Body.Data; got != "imagedata" {
+		t.Errorf("image/png body should be preserved, got %q", got)
+	}
+}
+
+// encodeBase64URL returns value encoded with base64.RawURLEncoding
+// (URL-safe alphabet, no padding).
func encodeBase64URL(value string) string {
return base64.RawURLEncoding.EncodeToString([]byte(value))
}