diff --git a/p2p/p2p_stats.go b/p2p/p2p_stats.go index 49e1bcaa..f406404b 100644 --- a/p2p/p2p_stats.go +++ b/p2p/p2p_stats.go @@ -114,6 +114,14 @@ func (m *p2pStatsManager) Stats(ctx context.Context, p *p2p) (*StatsSnapshot, er } prev := m.getSnapshot() + if prev == nil { + next, _ := m.refreshDiagnosticsSync(ctx, p) + if next != nil { + prev = next + } else { + prev = m.getSnapshot() + } + } snap := cloneSnapshot(prev) snap.PeersCount = peersCount // Store a separate struct instance in the cache to avoid aliasing with the returned snapshot, @@ -128,6 +136,45 @@ func (m *p2pStatsManager) Stats(ctx context.Context, p *p2p) (*StatsSnapshot, er return snap, nil } +func (m *p2pStatsManager) refreshDiagnosticsSync(ctx context.Context, p *p2p) (*StatsSnapshot, error) { + if m == nil || p == nil { + return nil, nil + } + if !m.refreshInFlight.CompareAndSwap(false, true) { + return nil, nil + } + + defer m.refreshInFlight.Store(false) + start := time.Now() + refreshCtx, cancel := context.WithTimeout(context.Background(), p2pStatsRefreshTimeout) + next, err := m.collectDiagnostics(refreshCtx, p, m.getSnapshot()) + cancel() + dur := time.Since(start) + + if next != nil { + m.setSnapshot(next) + m.markFresh() + } + + if err != nil { + logtrace.Warn(ctx, "p2p stats diagnostics initial refresh failed", logtrace.Fields{ + logtrace.FieldModule: "p2p", + "refresh": "diagnostics", + "ms": dur.Milliseconds(), + logtrace.FieldError: err.Error(), + }) + } + if dur > p2pStatsSlowRefreshThreshold { + logtrace.Warn(ctx, "p2p stats diagnostics initial refresh slow", logtrace.Fields{ + logtrace.FieldModule: "p2p", + "refresh": "diagnostics", + "ms": dur.Milliseconds(), + }) + } + + return next, err +} + func (m *p2pStatsManager) maybeRefreshDiagnostics(ctx context.Context, p *p2p) { if m == nil || p == nil { return @@ -142,10 +189,15 @@ func (m *p2pStatsManager) maybeRefreshDiagnostics(ctx context.Context, p *p2p) { start := time.Now() refreshCtx, cancel := context.WithTimeout(context.Background(), p2pStatsRefreshTimeout) - err := m.refreshDiagnostics(refreshCtx, p) + next, err := m.collectDiagnostics(refreshCtx, p, m.getSnapshot()) cancel() dur := time.Since(start) + if next != nil { + m.setSnapshot(next) + m.markFresh() + } + if err != nil { logtrace.Warn(logCtx, "p2p stats diagnostics refresh failed", logtrace.Fields{ logtrace.FieldModule: "p2p", @@ -164,12 +216,11 @@ func (m *p2pStatsManager) maybeRefreshDiagnostics(ctx context.Context, p *p2p) { }() } -func (m *p2pStatsManager) refreshDiagnostics(ctx context.Context, p *p2p) error { +func (m *p2pStatsManager) collectDiagnostics(ctx context.Context, p *p2p, prev *StatsSnapshot) (*StatsSnapshot, error) { if err := ctx.Err(); err != nil { - return err + return nil, err } - prev := m.getSnapshot() next := cloneSnapshot(prev) var refreshErr error @@ -203,9 +254,7 @@ func (m *p2pStatsManager) refreshDiagnostics(ctx context.Context, p *p2p) error } } - m.setSnapshot(next) - m.markFresh() - return refreshErr + return next, refreshErr } func cloneSnapshot(in *StatsSnapshot) *StatsSnapshot { diff --git a/supernode/status/metrics.go b/supernode/status/metrics.go index ff29d100..3263ded3 100644 --- a/supernode/status/metrics.go +++ b/supernode/status/metrics.go @@ -2,6 +2,7 @@ package status import ( "context" + "math" "time" "github.com/LumeraProtocol/supernode/v2/pkg/logtrace" @@ -10,6 +11,21 @@ import ( "github.com/shirou/gopsutil/v3/mem" ) +// diskSizeAdjustFactor compensates for observed discrepancies between the disk +// size reported by the node runtime and the "expected" decimal-GB figure used by +// external consumers (dashboards/on-chain metrics). +// +// Rationale: +// - Keep the adjustment in exactly one place (the status metrics source) so all +// downstream consumers remain consistent. +// - Apply it to both total and free to preserve internal consistency between +// total/free/usage%. +const diskSizeAdjustFactor = 1.1 + +func adjustDiskBytes(value uint64) uint64 { + return uint64(math.Round(float64(value) * diskSizeAdjustFactor)) +} + // MetricsCollector handles system resource monitoring type MetricsCollector struct{} @@ -60,6 +76,9 @@ func (m *MetricsCollector) CollectStorageMetrics(ctx context.Context, paths []st if len(paths) == 0 { paths = []string{"/"} } + // Note: callers may request multiple paths, but higher-level services report + // only the first volume to keep node metrics stable and comparable across + // environments (host vs container overlays, multiple mount points, etc.). var storageInfos []StorageInfo for _, path := range paths { usage, err := disk.Usage(path) @@ -67,7 +86,24 @@ func (m *MetricsCollector) CollectStorageMetrics(ctx context.Context, paths []st logtrace.Error(ctx, "failed to get storage info", logtrace.Fields{logtrace.FieldError: err.Error(), "path": path}) continue } - storageInfos = append(storageInfos, StorageInfo{Path: path, TotalBytes: usage.Total, UsedBytes: usage.Used, AvailableBytes: usage.Free, UsagePercent: usage.UsedPercent}) + totalBytes := adjustDiskBytes(usage.Total) + availableBytes := adjustDiskBytes(usage.Free) + if availableBytes > totalBytes { + availableBytes = totalBytes + } + usedBytes := totalBytes - availableBytes + usagePercent := 0.0 + if totalBytes > 0 { + usagePercent = float64(usedBytes) / float64(totalBytes) * 100 + } + + storageInfos = append(storageInfos, StorageInfo{ + Path: path, + TotalBytes: totalBytes, + UsedBytes: usedBytes, + AvailableBytes: availableBytes, + UsagePercent: usagePercent, + }) } return storageInfos } diff --git a/supernode/status/service.go b/supernode/status/service.go index 33faef1d..f7dab3ae 100644 --- a/supernode/status/service.go +++ b/supernode/status/service.go @@ -84,7 +84,12 @@ func (s *SupernodeStatusService) GetStatus(ctx context.Context, includeP2PMetric resp.Resources.HardwareSummary = fmt.Sprintf("%d cores / %.0fGB RAM", cores, resp.Resources.Memory.TotalGb) } // Storage metrics - for _, si := range s.metrics.CollectStorageMetrics(ctx, s.storagePaths) { + if storageInfos := s.metrics.CollectStorageMetrics(ctx, s.storagePaths); len(storageInfos) > 0 { + // Rationale: report only the first volume everywhere (status + on-chain + // metrics) to avoid ambiguity across environments where multiple mounts + // exist (e.g. container overlay + host filesystem). The configured default + // is "/" so this remains stable. + si := storageInfos[0] resp.Resources.StorageVolumes = append(resp.Resources.StorageVolumes, &pb.StatusResponse_Resources_Storage{ Path: si.Path, TotalBytes: si.TotalBytes, diff --git a/supernode/supernode_metrics/metrics_collection.go b/supernode/supernode_metrics/metrics_collection.go index f5b795c6..254780ec 100644 --- a/supernode/supernode_metrics/metrics_collection.go +++ b/supernode/supernode_metrics/metrics_collection.go @@ -48,6 +48,9 @@ func (hm *Collector) collectMetrics(ctx context.Context) (sntypes.SupernodeMetri } if statusResp.Resources != nil && len(statusResp.Resources.StorageVolumes) > 0 { + // Storage is sourced from the status service. Any disk-size adjustment + // must happen there (single source of truth) so status + on-chain metrics + // stay consistent. storage := statusResp.Resources.StorageVolumes[0] // 9–11: first volume is reported const bytesToGB = 1024.0 * 1024.0 * 1024.0