63 changes: 56 additions & 7 deletions p2p/p2p_stats.go
@@ -114,6 +114,14 @@ func (m *p2pStatsManager) Stats(ctx context.Context, p *p2p) (*StatsSnapshot, er
}

prev := m.getSnapshot()
+    if prev == nil {
+        next, _ := m.refreshDiagnosticsSync(ctx, p)
+        if next != nil {
+            prev = next
+        } else {
+            prev = m.getSnapshot()
+        }
+    }
snap := cloneSnapshot(prev)
snap.PeersCount = peersCount
// Store a separate struct instance in the cache to avoid aliasing with the returned snapshot,
@@ -128,6 +136,45 @@ func (m *p2pStatsManager) Stats(ctx context.Context, p *p2p) (*StatsSnapshot, er
return snap, nil
}

+func (m *p2pStatsManager) refreshDiagnosticsSync(ctx context.Context, p *p2p) (*StatsSnapshot, error) {
+    if m == nil || p == nil {
+        return nil, nil
+    }
+    if !m.refreshInFlight.CompareAndSwap(false, true) {
+        return nil, nil
+    }
+
+    defer m.refreshInFlight.Store(false)
+    start := time.Now()
+    refreshCtx, cancel := context.WithTimeout(context.Background(), p2pStatsRefreshTimeout)
+    next, err := m.collectDiagnostics(refreshCtx, p, m.getSnapshot())
+    cancel()
+    dur := time.Since(start)
+
+    if next != nil {
+        m.setSnapshot(next)
+        m.markFresh()
+    }
+
+    if err != nil {
+        logtrace.Warn(ctx, "p2p stats diagnostics initial refresh failed", logtrace.Fields{
+            logtrace.FieldModule: "p2p",
+            "refresh":            "diagnostics",
+            "ms":                 dur.Milliseconds(),
+            logtrace.FieldError:  err.Error(),
+        })
+    }
+    if dur > p2pStatsSlowRefreshThreshold {
+        logtrace.Warn(ctx, "p2p stats diagnostics initial refresh slow", logtrace.Fields{
+            logtrace.FieldModule: "p2p",
+            "refresh":            "diagnostics",
+            "ms":                 dur.Milliseconds(),
+        })
+    }
+
+    return next, err
+}
+
func (m *p2pStatsManager) maybeRefreshDiagnostics(ctx context.Context, p *p2p) {
if m == nil || p == nil {
return
@@ -142,10 +189,15 @@ func (m *p2pStatsManager) maybeRefreshDiagnostics(ctx context.Context, p *p2p) {

start := time.Now()
refreshCtx, cancel := context.WithTimeout(context.Background(), p2pStatsRefreshTimeout)
-        err := m.refreshDiagnostics(refreshCtx, p)
+        next, err := m.collectDiagnostics(refreshCtx, p, m.getSnapshot())
cancel()
dur := time.Since(start)

+        if next != nil {
+            m.setSnapshot(next)
+            m.markFresh()
+        }
+
if err != nil {
logtrace.Warn(logCtx, "p2p stats diagnostics refresh failed", logtrace.Fields{
logtrace.FieldModule: "p2p",
@@ -164,12 +216,11 @@ func (m *p2pStatsManager) maybeRefreshDiagnostics(ctx context.Context, p *p2p) {
}()
}

-func (m *p2pStatsManager) refreshDiagnostics(ctx context.Context, p *p2p) error {
+func (m *p2pStatsManager) collectDiagnostics(ctx context.Context, p *p2p, prev *StatsSnapshot) (*StatsSnapshot, error) {
if err := ctx.Err(); err != nil {
-        return err
+        return nil, err
}

-    prev := m.getSnapshot()
next := cloneSnapshot(prev)

var refreshErr error
@@ -203,9 +254,7 @@ func (m *p2pStatsManager) refreshDiagnostics(ctx context.Context, p *p2p) error
}
}

-    m.setSnapshot(next)
-    m.markFresh()
-    return refreshErr
+    return next, refreshErr
}

func cloneSnapshot(in *StatsSnapshot) *StatsSnapshot {
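Note on the change above: refreshDiagnosticsSync guards the synchronous first refresh with an atomic CompareAndSwap, so only one caller performs the collection while concurrent callers return immediately and fall back to whatever snapshot is cached. A minimal standalone sketch of that single-flight guard (the manager type and the fake workload are illustrative, not the actual p2p code):

// Minimal sketch: only the goroutine that wins the CompareAndSwap runs the
// refresh; everyone else bails out without blocking.
package main

import (
    "fmt"
    "sync"
    "sync/atomic"
    "time"
)

type manager struct {
    refreshInFlight atomic.Bool
}

// refresh reports whether this caller actually performed the (fake) refresh.
func (m *manager) refresh() bool {
    if !m.refreshInFlight.CompareAndSwap(false, true) {
        return false // another refresh is already running
    }
    defer m.refreshInFlight.Store(false)
    time.Sleep(10 * time.Millisecond) // stand-in for collectDiagnostics
    return true
}

func main() {
    m := &manager{}
    var wg sync.WaitGroup
    for i := 0; i < 4; i++ {
        wg.Add(1)
        go func(id int) {
            defer wg.Done()
            fmt.Printf("caller %d performed refresh: %v\n", id, m.refresh())
        }(i)
    }
    wg.Wait()
}

The loser of the race simply gets nil back, and Stats falls back to m.getSnapshot(), so concurrent Stats calls never block on each other.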
38 changes: 37 additions & 1 deletion supernode/status/metrics.go
@@ -2,6 +2,7 @@ package status

import (
"context"
"math"
"time"

"github.com/LumeraProtocol/supernode/v2/pkg/logtrace"
@@ -10,6 +11,21 @@ import (
"github.com/shirou/gopsutil/v3/mem"
)

+// diskSizeAdjustFactor compensates for observed discrepancies between the disk
+// size reported by the node runtime and the "expected" decimal-GB figure used by
+// external consumers (dashboards/on-chain metrics).
+//
+// Rationale:
+//   - Keep the adjustment in exactly one place (the status metrics source) so all
+//     downstream consumers remain consistent.
+//   - Apply it to both total and free to preserve internal consistency between
+//     total/free/usage%.
+const diskSizeAdjustFactor = 1.1
+
+func adjustDiskBytes(value uint64) uint64 {
+    return uint64(math.Round(float64(value) * diskSizeAdjustFactor))
+}
+
// MetricsCollector handles system resource monitoring
type MetricsCollector struct{}

@@ -60,14 +76,34 @@ func (m *MetricsCollector) CollectStorageMetrics(ctx context.Context, paths []st
if len(paths) == 0 {
paths = []string{"/"}
}
+    // Note: callers may request multiple paths, but higher-level services report
+    // only the first volume to keep node metrics stable and comparable across
+    // environments (host vs container overlays, multiple mount points, etc.).
var storageInfos []StorageInfo
for _, path := range paths {
usage, err := disk.Usage(path)
if err != nil {
logtrace.Error(ctx, "failed to get storage info", logtrace.Fields{logtrace.FieldError: err.Error(), "path": path})
continue
}
-        storageInfos = append(storageInfos, StorageInfo{Path: path, TotalBytes: usage.Total, UsedBytes: usage.Used, AvailableBytes: usage.Free, UsagePercent: usage.UsedPercent})
+        totalBytes := adjustDiskBytes(usage.Total)
+        availableBytes := adjustDiskBytes(usage.Free)
+        if availableBytes > totalBytes {
+            availableBytes = totalBytes
+        }
+        usedBytes := totalBytes - availableBytes
+        usagePercent := 0.0
+        if totalBytes > 0 {
+            usagePercent = float64(usedBytes) / float64(totalBytes) * 100
+        }
+
+        storageInfos = append(storageInfos, StorageInfo{
+            Path:           path,
+            TotalBytes:     totalBytes,
+            UsedBytes:      usedBytes,
+            AvailableBytes: availableBytes,
+            UsagePercent:   usagePercent,
+        })
}
return storageInfos
}
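A quick worked example of the adjustment above (made-up numbers): because the same 1.1 factor is applied to both total and free before used bytes and usage% are derived, only the absolute figures are scaled and the reported usage percentage matches the raw ratio, which is the internal-consistency property the comment calls out.

// Illustrative only: reuses adjustDiskBytes from this diff with fabricated sizes.
package main

import (
    "fmt"
    "math"
)

const diskSizeAdjustFactor = 1.1

func adjustDiskBytes(value uint64) uint64 {
    return uint64(math.Round(float64(value) * diskSizeAdjustFactor))
}

func main() {
    var rawTotal, rawFree uint64 = 500_000_000_000, 200_000_000_000 // 500 GB total, 200 GB free

    total := adjustDiskBytes(rawTotal) // 550_000_000_000
    free := adjustDiskBytes(rawFree)   // 220_000_000_000
    if free > total {
        free = total
    }
    used := total - free
    pct := float64(used) / float64(total) * 100

    // usage stays 60.0%, the same as the unadjusted 300/500 ratio
    fmt.Printf("total=%d used=%d usage=%.1f%%\n", total, used, pct)
}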
7 changes: 6 additions & 1 deletion supernode/status/service.go
@@ -84,7 +84,12 @@ func (s *SupernodeStatusService) GetStatus(ctx context.Context, includeP2PMetric
resp.Resources.HardwareSummary = fmt.Sprintf("%d cores / %.0fGB RAM", cores, resp.Resources.Memory.TotalGb)
}
// Storage metrics
-    for _, si := range s.metrics.CollectStorageMetrics(ctx, s.storagePaths) {
+    if storageInfos := s.metrics.CollectStorageMetrics(ctx, s.storagePaths); len(storageInfos) > 0 {
+        // Rationale: report only the first volume everywhere (status + on-chain
+        // metrics) to avoid ambiguity across environments where multiple mounts
+        // exist (e.g. container overlay + host filesystem). The configured default
+        // is "/" so this remains stable.
+        si := storageInfos[0]
resp.Resources.StorageVolumes = append(resp.Resources.StorageVolumes, &pb.StatusResponse_Resources_Storage{
Path: si.Path,
TotalBytes: si.TotalBytes,
3 changes: 3 additions & 0 deletions supernode/supernode_metrics/metrics_collection.go
@@ -48,6 +48,9 @@ func (hm *Collector) collectMetrics(ctx context.Context) (sntypes.SupernodeMetri
}

if statusResp.Resources != nil && len(statusResp.Resources.StorageVolumes) > 0 {
+        // Storage is sourced from the status service. Any disk-size adjustment
+        // must happen there (single source of truth) so status + on-chain metrics
+        // stay consistent.
storage := statusResp.Resources.StorageVolumes[0] // first volume is reported
const bytesToGB = 1024.0 * 1024.0 * 1024.0

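The rest of this hunk is truncated, but presumably the collector goes on to divide the first volume's already-adjusted byte counts by bytesToGB. A hypothetical sketch of that conversion (the storageVolume struct and its fields are stand-ins, not the actual pb types):

// Hypothetical continuation: derive GB figures for the on-chain metrics payload
// from the first reported volume, without re-applying any disk-size adjustment.
package main

import "fmt"

type storageVolume struct {
    TotalBytes uint64
    UsedBytes  uint64
}

const bytesToGB = 1024.0 * 1024.0 * 1024.0

func main() {
    storage := storageVolume{TotalBytes: 550_000_000_000, UsedBytes: 330_000_000_000}
    totalGB := float64(storage.TotalBytes) / bytesToGB
    usedGB := float64(storage.UsedBytes) / bytesToGB
    fmt.Printf("storage: %.1f GB used of %.1f GB total\n", usedGB, totalGB)
}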