From bd21420b3e2ad59435a1ca22b72be74cac997126 Mon Sep 17 00:00:00 2001 From: Luke Hagar Date: Fri, 14 Nov 2025 20:33:15 +0000 Subject: [PATCH 1/6] testing some optimizations --- Dockerfile | 4 +- README.md | 5 +- action.yml | 1 + cmd/check.go | 125 ++++++++++++++++++++++++--- entrypoint.sh | 90 ++++++------------- internal/fsurls/fsurls.go | 44 +++------- internal/web/cache.go | 132 ++++++++++++++++++++++++++++ internal/web/checker.go | 78 ++++++++++++----- internal/web/http.go | 176 +++++++++++++++++++++++++++++++++++--- internal/web/types.go | 1 + 10 files changed, 511 insertions(+), 145 deletions(-) create mode 100644 internal/web/cache.go diff --git a/Dockerfile b/Dockerfile index da6e8c9..8f6a70f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,7 +7,9 @@ COPY . . RUN CGO_ENABLED=0 go build -o /usr/local/bin/slinky ./ FROM alpine:3.20 -RUN apk add --no-cache curl jq ca-certificates +# jq is used in entrypoint.sh for parsing GitHub event JSON +# ca-certificates is needed for HTTPS requests +RUN apk add --no-cache jq ca-certificates COPY --from=build /usr/local/bin/slinky /usr/local/bin/slinky COPY entrypoint.sh /entrypoint.sh RUN chmod +x /entrypoint.sh diff --git a/README.md b/README.md index 3ca9d36..4f3123e 100644 --- a/README.md +++ b/README.md @@ -16,15 +16,18 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - pull-requests: write + pull-requests: write # Only needed if comment-pr is enabled steps: - uses: actions/checkout@v4 - name: Run Slinky uses: LukeHagar/slinky@v1 with: targets: "docs/,README.md,**/*.md" + # comment-pr: true # Optional: post results as PR comment (requires GITHUB_TOKEN) ``` +**Note:** The `GITHUB_TOKEN` is automatically provided by GitHub Actions and is only required for PR comment functionality. Core link checking works without it. If you disable PR comments (`comment-pr: false`), you can remove the `pull-requests: write` permission. + ### Inputs - **targets**: Comma-separated paths and patterns to scan. Can be directories, files, or glob patterns (e.g. `docs/,api-specs/**/*.yaml,README.md`). Default: `**/*` diff --git a/action.yml b/action.yml index 7cb9a81..b835ef2 100644 --- a/action.yml +++ b/action.yml @@ -53,6 +53,7 @@ runs: INPUT_FAIL_ON_FAILURES: ${{ inputs.fail_on_failures }} INPUT_COMMENT_PR: ${{ inputs.comment_pr }} INPUT_STEP_SUMMARY: ${{ inputs.step_summary }} + GITHUB_TOKEN: ${{ github.token }} outputs: json_path: diff --git a/cmd/check.go b/cmd/check.go index 654e229..5fa8673 100644 --- a/cmd/check.go +++ b/cmd/check.go @@ -158,9 +158,51 @@ func init() { fmt.Printf("::debug:: Root: %s\n", displayRoot) } + // Validate and clamp numeric inputs + if maxConcurrency < 1 { + maxConcurrency = 1 + } else if maxConcurrency > 100 { + maxConcurrency = 100 + } + if timeoutSeconds < 1 { + timeoutSeconds = 1 + } else if timeoutSeconds > 300 { + timeoutSeconds = 300 // Max 5 minutes + } + // Build config timeout := time.Duration(timeoutSeconds) * time.Second - cfg := web.Config{MaxConcurrency: maxConcurrency, RequestTimeout: timeout} + + // Set up URL cache if cache path is provided via environment variable + var urlCache *web.URLCache + if cachePath := os.Getenv("SLINKY_CACHE_PATH"); cachePath != "" { + cacheTTL := 24 // Default 24 hours + if ttlStr := os.Getenv("SLINKY_CACHE_TTL_HOURS"); ttlStr != "" { + if ttl, err := time.ParseDuration(ttlStr + "h"); err == nil && ttl > 0 { + cacheTTL = int(ttl.Hours()) + } + } + urlCache = web.NewURLCache(cachePath, cacheTTL) + if err := urlCache.Load(); err != nil { + if shouldDebug() { + fmt.Printf("::debug:: Failed to load cache: %v\n", err) + } + } + // Save cache when done + defer func() { + if err := urlCache.Save(); err != nil { + if shouldDebug() { + fmt.Printf("::debug:: Failed to save cache: %v\n", err) + } + } + }() + } + + cfg := web.Config{ + MaxConcurrency: maxConcurrency, + RequestTimeout: timeout, + Cache: urlCache, + } // Prepare URL list var urls []string @@ -275,9 +317,16 @@ func init() { } // If running on a PR, post or update the comment(s), chunking as needed - if ghOK && strings.TrimSpace(finalMDPath) != "" { + // Check if PR commenting is enabled (default to true if not set) + commentPR := true + if val := os.Getenv("INPUT_COMMENT_PR"); val != "" { + commentPR = strings.EqualFold(val, "true") + } + if ghOK && commentPR && strings.TrimSpace(finalMDPath) != "" { b, rerr := os.ReadFile(finalMDPath) - if rerr == nil { + if rerr != nil { + fmt.Printf("::warning:: Failed to read markdown report for PR comment: %v\n", rerr) + } else { full := string(b) if shouldDebug() { fmt.Printf("::debug:: Report size (chars): %d\n", len(full)) @@ -286,7 +335,10 @@ func init() { if shouldDebug() { fmt.Printf("::debug:: Posting %d chunk(s)\n", len(chunks)) } - _ = upsertPRComments(ghRepo, ghPR, ghToken, chunks) + if err := upsertPRComments(ghRepo, ghPR, ghToken, chunks); err != nil { + // Non-critical error: log warning but don't fail the run + fmt.Printf("::warning:: Failed to post PR comment: %v\n", err) + } } } @@ -440,32 +492,52 @@ func chunkMarkdownByURL(body string) []string { } // upsertPRComments deletes any existing slinky comments and posts the new chunked comments in order. +// Returns error if critical failures occur, but individual comment failures are logged and ignored. func upsertPRComments(repo string, prNumber int, token string, chunks []string) error { apiBase := "https://api.github.com" listURL := fmt.Sprintf("%s/repos/%s/issues/%d/comments?per_page=100", apiBase, repo, prNumber) - req, _ := http.NewRequest(http.MethodGet, listURL, nil) + req, err := http.NewRequest(http.MethodGet, listURL, nil) + if err != nil { + return fmt.Errorf("failed to create request: %w", err) + } req.Header.Set("Authorization", "Bearer "+token) req.Header.Set("Accept", "application/vnd.github+json") resp, err := http.DefaultClient.Do(req) if err != nil { - return err + return fmt.Errorf("failed to list comments: %w", err) } defer resp.Body.Close() + + if resp.StatusCode >= 400 { + return fmt.Errorf("failed to list comments: HTTP %d", resp.StatusCode) + } + var comments []struct { ID int `json:"id"` Body string `json:"body"` } - b, _ := io.ReadAll(resp.Body) - _ = json.Unmarshal(b, &comments) + b, err := io.ReadAll(resp.Body) + if err != nil { + return fmt.Errorf("failed to read comments response: %w", err) + } + if err := json.Unmarshal(b, &comments); err != nil { + // Non-critical: continue even if we can't parse existing comments + if shouldDebug() { + fmt.Printf("::debug:: Failed to parse comments: %v\n", err) + } + } // Delete all existing slinky-report comments to avoid stale entries for _, c := range comments { if strings.Contains(c.Body, "") { delURL := fmt.Sprintf("%s/repos/%s/issues/comments/%d", apiBase, repo, c.ID) - dReq, _ := http.NewRequest(http.MethodDelete, delURL, nil) + dReq, err := http.NewRequest(http.MethodDelete, delURL, nil) + if err != nil { + continue // Skip if we can't create request + } dReq.Header.Set("Authorization", "Bearer "+token) dReq.Header.Set("Accept", "application/vnd.github+json") - _, _ = http.DefaultClient.Do(dReq) + _, _ = http.DefaultClient.Do(dReq) // Non-critical: ignore delete errors } } @@ -473,14 +545,39 @@ func upsertPRComments(repo string, prNumber int, token string, chunks []string) for idx, chunk := range chunks { body := fmt.Sprintf("%s\n%s", "", chunk) postURL := fmt.Sprintf("%s/repos/%s/issues/%d/comments", apiBase, repo, prNumber) - payload, _ := json.Marshal(map[string]string{"body": body}) - req, _ = http.NewRequest(http.MethodPost, postURL, bytes.NewReader(payload)) + payload, err := json.Marshal(map[string]string{"body": body}) + if err != nil { + if shouldDebug() { + fmt.Printf("::debug:: Failed to marshal comment payload: %v\n", err) + } + continue + } + req, err := http.NewRequest(http.MethodPost, postURL, bytes.NewReader(payload)) + if err != nil { + if shouldDebug() { + fmt.Printf("::debug:: Failed to create POST request: %v\n", err) + } + continue + } req.Header.Set("Authorization", "Bearer "+token) req.Header.Set("Accept", "application/vnd.github+json") req.Header.Set("Content-Type", "application/json") - res, _ := http.DefaultClient.Do(req) + res, err := http.DefaultClient.Do(req) + if err != nil { + if shouldDebug() { + fmt.Printf("::debug:: Failed to post chunk %d/%d: %v\n", idx+1, len(chunks), err) + } + continue + } + res.Body.Close() + if res.StatusCode >= 400 { + if shouldDebug() { + fmt.Printf("::debug:: Failed to post chunk %d/%d: HTTP %d\n", idx+1, len(chunks), res.StatusCode) + } + continue + } if shouldDebug() { - fmt.Printf("::debug:: Posted chunk %d/%d: %v\n", idx+1, len(chunks), res) + fmt.Printf("::debug:: Posted chunk %d/%d successfully\n", idx+1, len(chunks)) } } return nil diff --git a/entrypoint.sh b/entrypoint.sh index 95cfd4f..f782397 100644 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -1,93 +1,55 @@ #!/bin/sh set -eu -# Set up environment variables for GitHub blob base URL +# Set up GitHub blob base URL for PR links if [ -n "${INPUT_REPO_BLOB_BASE:-}" ]; then export SLINKY_REPO_BLOB_BASE_URL="${INPUT_REPO_BLOB_BASE}" elif [ -n "${GITHUB_REPOSITORY:-}" ]; then COMMIT_SHA="${GITHUB_SHA:-}" if [ -n "${GITHUB_EVENT_PATH:-}" ] && command -v jq >/dev/null 2>&1; then - PR_HEAD_SHA="$(jq -r '.pull_request.head.sha // empty' "$GITHUB_EVENT_PATH" || true)" - if [ -n "$PR_HEAD_SHA" ]; then - COMMIT_SHA="$PR_HEAD_SHA" - fi + PR_HEAD_SHA="$(jq -r '.pull_request.head.sha // empty' "$GITHUB_EVENT_PATH" 2>/dev/null || true)" + [ -n "$PR_HEAD_SHA" ] && COMMIT_SHA="$PR_HEAD_SHA" fi - if [ -n "$COMMIT_SHA" ]; then - export SLINKY_REPO_BLOB_BASE_URL="https://github.com/${GITHUB_REPOSITORY}/blob/${COMMIT_SHA}" - fi -fi - -# Build command arguments -set -- check - -# Add optional flags -if [ -n "${INPUT_CONCURRENCY:-}" ]; then - set -- "$@" --concurrency "${INPUT_CONCURRENCY}" -fi - -if [ -n "${INPUT_TIMEOUT:-}" ]; then - set -- "$@" --timeout "${INPUT_TIMEOUT}" + [ -n "$COMMIT_SHA" ] && export SLINKY_REPO_BLOB_BASE_URL="https://github.com/${GITHUB_REPOSITORY}/blob/${COMMIT_SHA}" fi -if [ -n "${INPUT_JSON_OUT:-}" ]; then - set -- "$@" --json-out "${INPUT_JSON_OUT}" -fi +# Build slinky command arguments +ARGS="check" -if [ -n "${INPUT_MD_OUT:-}" ]; then - set -- "$@" --md-out "${INPUT_MD_OUT}" -fi +# Optional flags +[ -n "${INPUT_CONCURRENCY:-}" ] && ARGS="$ARGS --concurrency ${INPUT_CONCURRENCY}" +[ -n "${INPUT_TIMEOUT:-}" ] && ARGS="$ARGS --timeout ${INPUT_TIMEOUT}" +[ -n "${INPUT_JSON_OUT:-}" ] && ARGS="$ARGS --json-out ${INPUT_JSON_OUT}" +[ -n "${INPUT_MD_OUT:-}" ] && ARGS="$ARGS --md-out ${INPUT_MD_OUT}" +[ -n "${INPUT_REPO_BLOB_BASE:-}" ] && ARGS="$ARGS --repo-blob-base ${INPUT_REPO_BLOB_BASE}" -if [ -n "${INPUT_REPO_BLOB_BASE:-}" ]; then - set -- "$@" --repo-blob-base "${INPUT_REPO_BLOB_BASE}" -fi +# Boolean flags with defaults +[ "${INPUT_FAIL_ON_FAILURES:-true}" = "true" ] && ARGS="$ARGS --fail-on-failures=true" || ARGS="$ARGS --fail-on-failures=false" +[ "${INPUT_RESPECT_GITIGNORE:-true}" = "true" ] && ARGS="$ARGS --respect-gitignore=true" || ARGS="$ARGS --respect-gitignore=false" -if [ "${INPUT_FAIL_ON_FAILURES:-true}" = "true" ]; then - set -- "$@" --fail-on-failures=true -else - set -- "$@" --fail-on-failures=false -fi - -if [ "${INPUT_RESPECT_GITIGNORE:-true}" = "true" ]; then - set -- "$@" --respect-gitignore=true -else - set -- "$@" --respect-gitignore=false -fi - -# Add targets +# Add targets (comma-separated glob patterns) if [ -n "${INPUT_TARGETS:-}" ]; then - # Split comma-separated targets and add each one IFS=',' for target in $INPUT_TARGETS; do - target=$(echo "$target" | xargs) # trim whitespace - if [ -n "$target" ]; then - set -- "$@" "$target" - fi + target=$(echo "$target" | xargs) + [ -n "$target" ] && ARGS="$ARGS $target" done unset IFS else - # Default: scan everything - set -- "$@" "**/*" + ARGS="$ARGS **/*" fi # Debug output -if [ "${ACTIONS_STEP_DEBUG:-}" = "true" ]; then - printf "::debug:: CLI Args: slinky %s\n" "$*" -fi +[ "${ACTIONS_STEP_DEBUG:-}" = "true" ] && printf "::debug:: Running: slinky %s\n" "$ARGS" -# Execute the command -set +e -slinky "$@" -SLINKY_EXIT_CODE=$? -set -e +# Execute slinky (crawl repo with glob input, filter via .slinkignore) +slinky $ARGS +EXIT_CODE=$? # Expose outputs if [ -n "${GITHUB_OUTPUT:-}" ]; then - if [ -n "${INPUT_JSON_OUT:-}" ]; then - echo "json_path=${INPUT_JSON_OUT}" >> "$GITHUB_OUTPUT" - fi - if [ -n "${INPUT_MD_OUT:-}" ]; then - echo "md_path=${INPUT_MD_OUT}" >> "$GITHUB_OUTPUT" - fi + [ -n "${INPUT_JSON_OUT:-}" ] && echo "json_path=${INPUT_JSON_OUT}" >> "$GITHUB_OUTPUT" + [ -n "${INPUT_MD_OUT:-}" ] && echo "md_path=${INPUT_MD_OUT}" >> "$GITHUB_OUTPUT" fi # Append report to job summary if requested @@ -95,4 +57,4 @@ if [ "${INPUT_STEP_SUMMARY:-true}" = "true" ] && [ -n "${GITHUB_STEP_SUMMARY:-}" cat "${INPUT_MD_OUT}" >> "$GITHUB_STEP_SUMMARY" fi -exit ${SLINKY_EXIT_CODE:-0} \ No newline at end of file +exit ${EXIT_CODE:-0} \ No newline at end of file diff --git a/internal/fsurls/fsurls.go b/internal/fsurls/fsurls.go index 0eb2a47..f0ed36e 100644 --- a/internal/fsurls/fsurls.go +++ b/internal/fsurls/fsurls.go @@ -1,7 +1,6 @@ package fsurls import ( - "bufio" "encoding/json" "fmt" "io" @@ -247,22 +246,14 @@ func CollectURLsWithIgnoreConfig(rootPath string, globs []string, respectGitigno return nil } defer f.Close() - br := bufio.NewReader(f) - // Read up to maxSize bytes - var b strings.Builder - read := int64(0) - for { - chunk, cerr := br.ReadString('\n') - b.WriteString(chunk) - read += int64(len(chunk)) - if cerr == io.EOF || read > maxSize { - break - } - if cerr != nil { - break - } + // Read up to maxSize bytes efficiently using LimitReader + limitedReader := io.LimitReader(f, maxSize) + contentBytes, readErr := io.ReadAll(limitedReader) + if readErr != nil { + // Non-critical error: skip file and continue + return nil } - content := b.String() + content := string(contentBytes) // Skip if likely binary (NUL present) if strings.IndexByte(content, '\x00') >= 0 { return nil @@ -428,21 +419,14 @@ func CollectURLsProgressWithIgnoreConfig(rootPath string, globs []string, respec return nil } defer f.Close() - br := bufio.NewReader(f) - var b strings.Builder - read := int64(0) - for { - chunk, cerr := br.ReadString('\n') - b.WriteString(chunk) - read += int64(len(chunk)) - if cerr == io.EOF || read > maxSize { - break - } - if cerr != nil { - break - } + // Read up to maxSize bytes efficiently using LimitReader + limitedReader := io.LimitReader(f, maxSize) + contentBytes, readErr := io.ReadAll(limitedReader) + if readErr != nil { + // Non-critical error: skip file and continue + return nil } - content := b.String() + content := string(contentBytes) if strings.IndexByte(content, '\x00') >= 0 { return nil } diff --git a/internal/web/cache.go b/internal/web/cache.go new file mode 100644 index 0000000..774d410 --- /dev/null +++ b/internal/web/cache.go @@ -0,0 +1,132 @@ +package web + +import ( + "encoding/json" + "fmt" + "os" + "time" +) + +// CacheEntry represents a cached URL check result +type CacheEntry struct { + URL string `json:"url"` + OK bool `json:"ok"` + Status int `json:"status"` + ErrMsg string `json:"error,omitempty"` + Checked time.Time `json:"checked"` +} + +// URLCache manages URL result caching +type URLCache struct { + entries map[string]CacheEntry + ttl time.Duration + path string +} + +// NewURLCache creates a new URL cache with optional file path +func NewURLCache(cachePath string, ttlHours int) *URLCache { + ttl := time.Duration(ttlHours) * time.Hour + if ttl <= 0 { + ttl = 24 * time.Hour // Default 24 hours + } + return &URLCache{ + entries: make(map[string]CacheEntry), + ttl: ttl, + path: cachePath, + } +} + +// Load loads cache entries from file if path is set +func (c *URLCache) Load() error { + if c.path == "" { + return nil // No cache file specified + } + + data, err := os.ReadFile(c.path) + if err != nil { + if os.IsNotExist(err) { + return nil // Cache file doesn't exist yet, that's OK + } + return fmt.Errorf("failed to read cache file: %w", err) + } + + var entries []CacheEntry + if err := json.Unmarshal(data, &entries); err != nil { + // Non-critical: if cache is corrupted, start fresh + return nil + } + + now := time.Now() + c.entries = make(map[string]CacheEntry, len(entries)) + for _, entry := range entries { + // Only load entries that haven't expired + if now.Sub(entry.Checked) < c.ttl { + c.entries[entry.URL] = entry + } + } + + return nil +} + +// Get retrieves a cached result for a URL +func (c *URLCache) Get(url string) (CacheEntry, bool) { + entry, ok := c.entries[url] + if !ok { + return CacheEntry{}, false + } + + // Check if entry has expired + if time.Since(entry.Checked) >= c.ttl { + delete(c.entries, url) + return CacheEntry{}, false + } + + return entry, true +} + +// Set stores a result in the cache +func (c *URLCache) Set(url string, ok bool, status int, errMsg string) { + c.entries[url] = CacheEntry{ + URL: url, + OK: ok, + Status: status, + ErrMsg: errMsg, + Checked: time.Now(), + } +} + +// Save saves cache entries to file if path is set +func (c *URLCache) Save() error { + if c.path == "" { + return nil // No cache file specified + } + + // Convert map to slice for JSON serialization + entries := make([]CacheEntry, 0, len(c.entries)) + for _, entry := range c.entries { + entries = append(entries, entry) + } + + data, err := json.MarshalIndent(entries, "", " ") + if err != nil { + return fmt.Errorf("failed to marshal cache: %w", err) + } + + // Write to temp file first, then rename (atomic write) + tmpPath := c.path + ".tmp" + if err := os.WriteFile(tmpPath, data, 0644); err != nil { + return fmt.Errorf("failed to write cache file: %w", err) + } + + if err := os.Rename(tmpPath, c.path); err != nil { + return fmt.Errorf("failed to rename cache file: %w", err) + } + + return nil +} + +// Clear removes all entries from the cache +func (c *URLCache) Clear() { + c.entries = make(map[string]CacheEntry) +} + diff --git a/internal/web/checker.go b/internal/web/checker.go index d0af2a7..a565e41 100644 --- a/internal/web/checker.go +++ b/internal/web/checker.go @@ -2,9 +2,11 @@ package web import ( "context" + "fmt" "net" "net/http" "sort" + "sync/atomic" "time" ) @@ -13,13 +15,18 @@ import ( func CheckURLs(ctx context.Context, urls []string, sources map[string][]string, out chan<- Result, stats chan<- Stats, cfg Config) { defer close(out) - // Build HTTP client similar to crawler + // Build HTTP client with optimized connection pooling + // Increase MaxIdleConns to handle many unique domains efficiently + maxIdleConns := cfg.MaxConcurrency * 4 + if maxIdleConns < 100 { + maxIdleConns = 100 // Minimum 100 idle connections for better performance across domains + } transport := &http.Transport{ Proxy: http.ProxyFromEnvironment, DialContext: (&net.Dialer{Timeout: 2 * time.Second, KeepAlive: 30 * time.Second}).DialContext, TLSHandshakeTimeout: 5 * time.Second, ExpectContinueTimeout: 1 * time.Second, - MaxIdleConns: cfg.MaxConcurrency * 2, + MaxIdleConns: maxIdleConns, MaxIdleConnsPerHost: cfg.MaxConcurrency, MaxConnsPerHost: cfg.MaxConcurrency, IdleConnTimeout: 30 * time.Second, @@ -31,16 +38,11 @@ func CheckURLs(ctx context.Context, urls []string, sources map[string][]string, jobs := make(chan job, len(urls)) done := make(chan struct{}) - // Seed jobs - unique := make(map[string]struct{}, len(urls)) + // Seed jobs (URLs are already deduplicated in check.go, so no need to deduplicate here) for _, u := range urls { if u == "" { continue } - if _, ok := unique[u]; ok { - continue - } - unique[u] = struct{}{} jobs <- job{url: u} } close(jobs) @@ -49,8 +51,9 @@ func CheckURLs(ctx context.Context, urls []string, sources map[string][]string, if concurrency <= 0 { concurrency = 8 } - processed := 0 - pending := len(unique) + // Use atomic counters to avoid race conditions + var processed int64 + var pending int64 = int64(len(urls)) worker := func() { for j := range jobs { @@ -59,15 +62,47 @@ func CheckURLs(ctx context.Context, urls []string, sources map[string][]string, return default: } - ok, status, resp, err := fetchWithMethod(ctx, client, http.MethodGet, j.url) - if resp != nil && resp.Body != nil { - resp.Body.Close() + + var ok bool + var status int + var err error + var cacheHit bool + + // Check cache first if available + if cfg.Cache != nil { + if cached, found := cfg.Cache.Get(j.url); found { + ok = cached.OK + status = cached.Status + err = nil + if cached.ErrMsg != "" { + err = fmt.Errorf("%s", cached.ErrMsg) + } + cacheHit = true + } } - // Treat 401/403/408/429 as valid links - if status == http.StatusUnauthorized || status == http.StatusForbidden || status == http.StatusRequestTimeout || status == http.StatusTooManyRequests { - ok = true - err = nil + + // If not cached, fetch from network + if !cacheHit { + var resp *http.Response + ok, status, resp, err = fetchWithMethod(ctx, client, http.MethodGet, j.url) + if resp != nil && resp.Body != nil { + resp.Body.Close() + } + // Status code handling is now done in fetchWithMethod: + // - 200-299, 401, 403 are OK (page exists) + // - 404, DNS errors, connection refused are bad (flagged) + // - 408, 429, 5xx are retried then flagged if still failing + + // Store in cache + if cfg.Cache != nil { + errMsg := "" + if err != nil { + errMsg = err.Error() + } + cfg.Cache.Set(j.url, ok, status, errMsg) + } } + // Check context before sending result select { case <-ctx.Done(): @@ -82,16 +117,17 @@ func CheckURLs(ctx context.Context, urls []string, sources map[string][]string, // Send result with context check select { - case out <- Result{URL: j.url, OK: ok, Status: status, Err: err, ErrMsg: errString(err), Method: http.MethodGet, Sources: cloneAndSort(srcs)}: + case out <- Result{URL: j.url, OK: ok, Status: status, Err: err, ErrMsg: errString(err), Method: http.MethodGet, Sources: cloneAndSort(srcs), CacheHit: cacheHit}: case <-ctx.Done(): return } - processed++ - pending-- + // Atomically update counters + proc := atomic.AddInt64(&processed, 1) + pend := atomic.AddInt64(&pending, -1) if stats != nil { select { - case stats <- Stats{Pending: pending, Processed: processed}: + case stats <- Stats{Pending: int(pend), Processed: int(proc)}: default: } } diff --git a/internal/web/http.go b/internal/web/http.go index e020a0b..127ebf0 100644 --- a/internal/web/http.go +++ b/internal/web/http.go @@ -3,34 +3,182 @@ package web import ( "context" "errors" + "fmt" "net" "net/http" + "net/url" "strings" + "time" ) const browserUA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0 Safari/537.36" +const maxRedirects = 10 +const maxRetries = 3 +// fetchWithMethod performs HTTP request with retry logic and redirect following. +// Returns: (isOK, statusCode, response, error) +// Status code handling: +// - OK (don't flag): 200-299, 401 (Unauthorized), 403 (Forbidden) - page exists +// - Bad (flag): 404 (Not Found), DNS errors, connection refused +// - Retry then flag: 408 (Timeout), 429 (Rate Limited), 5xx (Server Errors) func fetchWithMethod(ctx context.Context, client *http.Client, method string, raw string) (bool, int, *http.Response, error) { - req, err := http.NewRequestWithContext(ctx, method, raw, nil) + var lastErr error + var lastResp *http.Response + + // Retry logic for transient errors + for attempt := 0; attempt <= maxRetries; attempt++ { + if attempt > 0 { + // Exponential backoff: 1s, 2s, 4s + backoff := time.Duration(1< maxRedirects { + return resp, resp.StatusCode, simpleError("too many redirects") + } + + status := resp.StatusCode + if status < 300 || status >= 400 { + // Not a redirect + return resp, status, nil + } + + // Handle redirect + location := resp.Header.Get("Location") + if location == "" { + return resp, status, nil + } + + // Resolve relative URLs + baseURL, err := url.Parse(originalURL) + if err != nil { + return resp, status, err + } + redirectURL, err := baseURL.Parse(location) + if err != nil { + return resp, status, err + } + + // Check for redirect loop (simple check: same URL) + if redirectURL.String() == originalURL { + return resp, status, simpleError("redirect loop detected") + } + + // Close previous response body + if resp.Body != nil { + resp.Body.Close() + } + + // Follow redirect + req, err := http.NewRequestWithContext(ctx, "GET", redirectURL.String(), nil) if err != nil { - return false, 0, nil, err + return resp, status, err } req.Header.Set("User-Agent", browserUA) req.Header.Set("Accept", "*/*") - resp, err := client.Do(req) + + newResp, err := client.Do(req) if err != nil { - if isDNSError(err) { - return false, 404, nil, simpleError("host not found") - } - if isTimeout(err) { - return false, 408, nil, simpleError("request timeout") - } - if isRefused(err) { - return false, 503, nil, simpleError("connection refused") - } - return false, 0, nil, err + return resp, status, err } - return resp.StatusCode >= 200 && resp.StatusCode < 400, resp.StatusCode, resp, nil + + // Recursively follow redirects + return followRedirects(ctx, client, newResp, redirectURL.String(), depth+1) +} + +// isOKStatus determines if a status code indicates the link is valid. +// 200-299: Success +// 401: Unauthorized (page exists, just requires auth) +// 403: Forbidden (page exists, just requires permissions) +func isOKStatus(status int) bool { + if status >= 200 && status < 300 { + return true + } + if status == http.StatusUnauthorized || status == http.StatusForbidden { + return true + } + return false +} + +// isRetryableStatus determines if a status code should trigger a retry. +func isRetryableStatus(status int) bool { + return status == http.StatusRequestTimeout || // 408 + status == http.StatusTooManyRequests || // 429 + status >= 500 // 5xx server errors } func errString(e error) string { diff --git a/internal/web/types.go b/internal/web/types.go index f5bbdc1..f27574f 100644 --- a/internal/web/types.go +++ b/internal/web/types.go @@ -26,4 +26,5 @@ type Config struct { RequestTimeout time.Duration MaxRetries429 int Exclude []string + Cache *URLCache // Optional URL result cache } From eb6a7a4366b86abe27623a8a3a657a60696ed9fd Mon Sep 17 00:00:00 2001 From: Luke Hagar Date: Fri, 14 Nov 2025 20:36:58 +0000 Subject: [PATCH 2/6] Update action.yml and check.go to clarify PR comment behavior Enhance the description for the 'comment_pr' input in action.yml to specify that it defaults to true when GITHUB_TOKEN is present. In check.go, update the logic to explicitly disable PR commenting only when the input is set to "false", improving clarity on the commenting behavior during PR checks. --- action.yml | 2 +- cmd/check.go | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/action.yml b/action.yml index b835ef2..4855780 100644 --- a/action.yml +++ b/action.yml @@ -32,7 +32,7 @@ inputs: description: "Fail the job if any links fail" required: false comment_pr: - description: "If running on a PR, post a comment with the report" + description: "If running on a PR, post a comment with the report. Default: true (enabled when GITHUB_TOKEN is present)" required: false step_summary: description: "Append the report to the GitHub Step Summary" diff --git a/cmd/check.go b/cmd/check.go index 5fa8673..09f796a 100644 --- a/cmd/check.go +++ b/cmd/check.go @@ -317,11 +317,14 @@ func init() { } // If running on a PR, post or update the comment(s), chunking as needed - // Check if PR commenting is enabled (default to true if not set) - commentPR := true + // PR comments are enabled by default when token is present + // Only disable if explicitly set to "false" + commentPR := true // Default: enabled if val := os.Getenv("INPUT_COMMENT_PR"); val != "" { - commentPR = strings.EqualFold(val, "true") + // Explicitly check for "false" to disable, everything else enables + commentPR = !strings.EqualFold(strings.TrimSpace(val), "false") } + // Only post comments if: GitHub PR detected, commenting enabled, and report exists if ghOK && commentPR && strings.TrimSpace(finalMDPath) != "" { b, rerr := os.ReadFile(finalMDPath) if rerr != nil { From e1295a63bf0172a950ccc44fcbe2eb695c4066ee Mon Sep 17 00:00:00 2001 From: Luke Hagar Date: Fri, 14 Nov 2025 20:38:56 +0000 Subject: [PATCH 3/6] Update action.yml, README.md, and example workflow to clarify GITHUB_TOKEN usage This commit modifies action.yml to remove the direct reference to GITHUB_TOKEN and updates the README.md to specify that it should be accessed via secrets.GITHUB_TOKEN. Additionally, the example workflow is updated to include the GITHUB_TOKEN in the environment variables, ensuring clarity on its necessity for PR comment functionality. --- .github/workflows/example-slinky.yml | 2 ++ README.md | 6 ++++-- action.yml | 3 --- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/workflows/example-slinky.yml b/.github/workflows/example-slinky.yml index 222527a..34384c6 100644 --- a/.github/workflows/example-slinky.yml +++ b/.github/workflows/example-slinky.yml @@ -17,6 +17,8 @@ jobs: - name: Run Slinky link checker uses: ./ + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Upload results if: always() diff --git a/README.md b/README.md index 4f3123e..11211ae 100644 --- a/README.md +++ b/README.md @@ -21,12 +21,14 @@ jobs: - uses: actions/checkout@v4 - name: Run Slinky uses: LukeHagar/slinky@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Required for PR comments with: targets: "docs/,README.md,**/*.md" - # comment-pr: true # Optional: post results as PR comment (requires GITHUB_TOKEN) + # comment-pr: true # Optional: post results as PR comment (default: true) ``` -**Note:** The `GITHUB_TOKEN` is automatically provided by GitHub Actions and is only required for PR comment functionality. Core link checking works without it. If you disable PR comments (`comment-pr: false`), you can remove the `pull-requests: write` permission. +**Note:** The `GITHUB_TOKEN` is automatically provided by GitHub Actions via `secrets.GITHUB_TOKEN` and is only required for PR comment functionality. Core link checking works without it. If you disable PR comments (`comment-pr: false`), you can remove the `pull-requests: write` permission and the `GITHUB_TOKEN` env variable. ### Inputs diff --git a/action.yml b/action.yml index 4855780..8f1c5b8 100644 --- a/action.yml +++ b/action.yml @@ -53,12 +53,9 @@ runs: INPUT_FAIL_ON_FAILURES: ${{ inputs.fail_on_failures }} INPUT_COMMENT_PR: ${{ inputs.comment_pr }} INPUT_STEP_SUMMARY: ${{ inputs.step_summary }} - GITHUB_TOKEN: ${{ github.token }} outputs: json_path: description: "Path to JSON results file" md_path: description: "Path to Markdown report file" - - From 27718b695e44c3d9be08313cf307af33271c9531 Mon Sep 17 00:00:00 2001 From: Luke Hagar Date: Fri, 14 Nov 2025 20:52:38 +0000 Subject: [PATCH 4/6] Refactor action.yml input names for consistency and update example workflow This commit updates the input names in action.yml to use hyphenated formats for better consistency and clarity. Additionally, the example workflow is modified to demonstrate the new input names and to showcase the functionality of disabling blocking failures and enabling PR comments. --- .github/workflows/example-slinky.yml | 7 ++++++- action.yml | 28 ++++++++++++++-------------- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/.github/workflows/example-slinky.yml b/.github/workflows/example-slinky.yml index 34384c6..4dab036 100644 --- a/.github/workflows/example-slinky.yml +++ b/.github/workflows/example-slinky.yml @@ -15,10 +15,15 @@ jobs: - name: Checkout uses: actions/checkout@v4 - - name: Run Slinky link checker + - name: Run Slinky link checker (Example - failures won't block) uses: ./ env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + fail-on-failures: false # Disable blocking failures for example/demo purposes + comment-pr: true # Enable PR comments to showcase the feature + md-out: results.md + json-out: results.json - name: Upload results if: always() diff --git a/action.yml b/action.yml index 8f1c5b8..56960ab 100644 --- a/action.yml +++ b/action.yml @@ -16,25 +16,25 @@ inputs: timeout: description: "HTTP timeout seconds" required: false - respect_gitignore: + respect-gitignore: description: "Respect .gitignore while scanning" required: false - json_out: + json-out: description: "Optional path to write JSON results" required: false - md_out: + md-out: description: "Optional path to write Markdown report for PR comment" required: false - repo_blob_base: + repo-blob-base: description: "Override GitHub blob base URL (https://github.com///blob/)" required: false - fail_on_failures: + fail-on-failures: description: "Fail the job if any links fail" required: false - comment_pr: + comment-pr: description: "If running on a PR, post a comment with the report. Default: true (enabled when GITHUB_TOKEN is present)" required: false - step_summary: + step-summary: description: "Append the report to the GitHub Step Summary" required: false @@ -46,13 +46,13 @@ runs: INPUT_TARGETS: ${{ inputs.targets }} INPUT_CONCURRENCY: ${{ inputs.concurrency }} INPUT_TIMEOUT: ${{ inputs.timeout }} - INPUT_RESPECT_GITIGNORE: ${{ inputs.respect_gitignore }} - INPUT_JSON_OUT: ${{ inputs.json_out }} - INPUT_MD_OUT: ${{ inputs.md_out }} - INPUT_REPO_BLOB_BASE: ${{ inputs.repo_blob_base }} - INPUT_FAIL_ON_FAILURES: ${{ inputs.fail_on_failures }} - INPUT_COMMENT_PR: ${{ inputs.comment_pr }} - INPUT_STEP_SUMMARY: ${{ inputs.step_summary }} + INPUT_RESPECT_GITIGNORE: ${{ inputs.respect-gitignore }} + INPUT_JSON_OUT: ${{ inputs.json-out }} + INPUT_MD_OUT: ${{ inputs.md-out }} + INPUT_REPO_BLOB_BASE: ${{ inputs.repo-blob-base }} + INPUT_FAIL_ON_FAILURES: ${{ inputs.fail-on-failures }} + INPUT_COMMENT_PR: ${{ inputs.comment-pr }} + INPUT_STEP_SUMMARY: ${{ inputs.step-summary }} outputs: json_path: From 241246499dd4697d963cf895b48ae3fd0a8bcd42 Mon Sep 17 00:00:00 2001 From: Luke Hagar Date: Fri, 14 Nov 2025 20:57:29 +0000 Subject: [PATCH 5/6] Implement mutex for thread-safe URLCache operations and optimize job handling in CheckURLs function This commit introduces a mutex to the URLCache struct to ensure thread-safe access to cache entries during load, get, set, and clear operations. Additionally, it refines the job handling logic in the CheckURLs function by using atomic counters for processed and pending jobs, improving concurrency management. --- internal/web/cache.go | 14 ++++++++++++++ internal/web/checker.go | 11 ++++++++--- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/internal/web/cache.go b/internal/web/cache.go index 774d410..024e705 100644 --- a/internal/web/cache.go +++ b/internal/web/cache.go @@ -4,6 +4,7 @@ import ( "encoding/json" "fmt" "os" + "sync" "time" ) @@ -18,6 +19,7 @@ type CacheEntry struct { // URLCache manages URL result caching type URLCache struct { + mu sync.RWMutex entries map[string]CacheEntry ttl time.Duration path string @@ -57,6 +59,8 @@ func (c *URLCache) Load() error { } now := time.Now() + c.mu.Lock() + defer c.mu.Unlock() c.entries = make(map[string]CacheEntry, len(entries)) for _, entry := range entries { // Only load entries that haven't expired @@ -70,14 +74,18 @@ func (c *URLCache) Load() error { // Get retrieves a cached result for a URL func (c *URLCache) Get(url string) (CacheEntry, bool) { + c.mu.RLock() entry, ok := c.entries[url] + c.mu.RUnlock() if !ok { return CacheEntry{}, false } // Check if entry has expired if time.Since(entry.Checked) >= c.ttl { + c.mu.Lock() delete(c.entries, url) + c.mu.Unlock() return CacheEntry{}, false } @@ -86,6 +94,8 @@ func (c *URLCache) Get(url string) (CacheEntry, bool) { // Set stores a result in the cache func (c *URLCache) Set(url string, ok bool, status int, errMsg string) { + c.mu.Lock() + defer c.mu.Unlock() c.entries[url] = CacheEntry{ URL: url, OK: ok, @@ -102,10 +112,12 @@ func (c *URLCache) Save() error { } // Convert map to slice for JSON serialization + c.mu.RLock() entries := make([]CacheEntry, 0, len(c.entries)) for _, entry := range c.entries { entries = append(entries, entry) } + c.mu.RUnlock() data, err := json.MarshalIndent(entries, "", " ") if err != nil { @@ -127,6 +139,8 @@ func (c *URLCache) Save() error { // Clear removes all entries from the cache func (c *URLCache) Clear() { + c.mu.Lock() + defer c.mu.Unlock() c.entries = make(map[string]CacheEntry) } diff --git a/internal/web/checker.go b/internal/web/checker.go index a565e41..26dfcc2 100644 --- a/internal/web/checker.go +++ b/internal/web/checker.go @@ -38,12 +38,18 @@ func CheckURLs(ctx context.Context, urls []string, sources map[string][]string, jobs := make(chan job, len(urls)) done := make(chan struct{}) + // Use atomic counters to avoid race conditions + var processed int64 + var pending int64 + var jobCount int64 + // Seed jobs (URLs are already deduplicated in check.go, so no need to deduplicate here) for _, u := range urls { if u == "" { continue } jobs <- job{url: u} + jobCount++ } close(jobs) @@ -51,9 +57,8 @@ func CheckURLs(ctx context.Context, urls []string, sources map[string][]string, if concurrency <= 0 { concurrency = 8 } - // Use atomic counters to avoid race conditions - var processed int64 - var pending int64 = int64(len(urls)) + // Set pending to actual number of jobs enqueued + pending = jobCount worker := func() { for j := range jobs { From 14b02cbc45aa83294073b7965a6accae9cf0e2eb Mon Sep 17 00:00:00 2001 From: Luke Hagar Date: Fri, 14 Nov 2025 21:00:34 +0000 Subject: [PATCH 6/6] Refactor argument handling in entrypoint.sh for improved clarity and robustness This commit modifies the argument construction in entrypoint.sh to use 'set --' for better handling of optional flags and targets. This change ensures that arguments are properly quoted, accommodating spaces and enhancing overall script reliability. Additionally, debug output is updated to display the constructed command more clearly. --- entrypoint.sh | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/entrypoint.sh b/entrypoint.sh index f782397..dc8307e 100644 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -13,37 +13,43 @@ elif [ -n "${GITHUB_REPOSITORY:-}" ]; then [ -n "$COMMIT_SHA" ] && export SLINKY_REPO_BLOB_BASE_URL="https://github.com/${GITHUB_REPOSITORY}/blob/${COMMIT_SHA}" fi -# Build slinky command arguments -ARGS="check" +# Build slinky command arguments using set -- for proper argument handling +set -- check -# Optional flags -[ -n "${INPUT_CONCURRENCY:-}" ] && ARGS="$ARGS --concurrency ${INPUT_CONCURRENCY}" -[ -n "${INPUT_TIMEOUT:-}" ] && ARGS="$ARGS --timeout ${INPUT_TIMEOUT}" -[ -n "${INPUT_JSON_OUT:-}" ] && ARGS="$ARGS --json-out ${INPUT_JSON_OUT}" -[ -n "${INPUT_MD_OUT:-}" ] && ARGS="$ARGS --md-out ${INPUT_MD_OUT}" -[ -n "${INPUT_REPO_BLOB_BASE:-}" ] && ARGS="$ARGS --repo-blob-base ${INPUT_REPO_BLOB_BASE}" +# Optional flags (properly quoted to handle spaces) +[ -n "${INPUT_CONCURRENCY:-}" ] && set -- "$@" --concurrency "${INPUT_CONCURRENCY}" +[ -n "${INPUT_TIMEOUT:-}" ] && set -- "$@" --timeout "${INPUT_TIMEOUT}" +[ -n "${INPUT_JSON_OUT:-}" ] && set -- "$@" --json-out "${INPUT_JSON_OUT}" +[ -n "${INPUT_MD_OUT:-}" ] && set -- "$@" --md-out "${INPUT_MD_OUT}" +[ -n "${INPUT_REPO_BLOB_BASE:-}" ] && set -- "$@" --repo-blob-base "${INPUT_REPO_BLOB_BASE}" # Boolean flags with defaults -[ "${INPUT_FAIL_ON_FAILURES:-true}" = "true" ] && ARGS="$ARGS --fail-on-failures=true" || ARGS="$ARGS --fail-on-failures=false" -[ "${INPUT_RESPECT_GITIGNORE:-true}" = "true" ] && ARGS="$ARGS --respect-gitignore=true" || ARGS="$ARGS --respect-gitignore=false" +[ "${INPUT_FAIL_ON_FAILURES:-true}" = "true" ] && set -- "$@" --fail-on-failures=true || set -- "$@" --fail-on-failures=false +[ "${INPUT_RESPECT_GITIGNORE:-true}" = "true" ] && set -- "$@" --respect-gitignore=true || set -- "$@" --respect-gitignore=false # Add targets (comma-separated glob patterns) if [ -n "${INPUT_TARGETS:-}" ]; then IFS=',' for target in $INPUT_TARGETS; do target=$(echo "$target" | xargs) - [ -n "$target" ] && ARGS="$ARGS $target" + [ -n "$target" ] && set -- "$@" "$target" done unset IFS else - ARGS="$ARGS **/*" + set -- "$@" "**/*" fi # Debug output -[ "${ACTIONS_STEP_DEBUG:-}" = "true" ] && printf "::debug:: Running: slinky %s\n" "$ARGS" +if [ "${ACTIONS_STEP_DEBUG:-}" = "true" ]; then + printf "::debug:: Running: slinky" + for arg in "$@"; do + printf " %s" "$arg" + done + printf "\n" +fi # Execute slinky (crawl repo with glob input, filter via .slinkignore) -slinky $ARGS +slinky "$@" EXIT_CODE=$? # Expose outputs