diff --git a/cmd/proxsave/main.go b/cmd/proxsave/main.go index 4e9fc18..45881fa 100644 --- a/cmd/proxsave/main.go +++ b/cmd/proxsave/main.go @@ -603,6 +603,15 @@ func run() int { } }() + // Defer for network rollback countdown (LIFO: executes BEFORE footer) + defer func() { + if finalExitCode == exitCodeInterrupted { + if abortInfo := orchestrator.GetLastRestoreAbortInfo(); abortInfo != nil { + printNetworkRollbackCountdown(abortInfo) + } + } + }() + defer func() { if !args.Support || pendingSupportStats == nil { return @@ -746,7 +755,7 @@ func run() int { logging.Info("Restore mode enabled - starting CLI workflow...") if err := orchestrator.RunRestoreWorkflow(ctx, cfg, logger, toolVersion); err != nil { if errors.Is(err, orchestrator.ErrRestoreAborted) { - logging.Info("Restore workflow aborted by user") + logging.Warning("Restore workflow aborted by user") if args.Support { pendingSupportStats = support.BuildSupportStats(logger, resolveHostname(), envInfo.Type, envInfo.Version, toolVersion, startTime, time.Now(), exitCodeInterrupted, "restore") } @@ -776,7 +785,7 @@ func run() int { } if err := orchestrator.RunRestoreWorkflowTUI(ctx, cfg, logger, toolVersion, args.ConfigPath, sig); err != nil { if errors.Is(err, orchestrator.ErrRestoreAborted) || errors.Is(err, orchestrator.ErrDecryptAborted) { - logging.Info("Restore workflow aborted by user") + logging.Warning("Restore workflow aborted by user") if args.Support { pendingSupportStats = support.BuildSupportStats(logger, resolveHostname(), envInfo.Type, envInfo.Version, toolVersion, startTime, time.Now(), exitCodeInterrupted, "restore") } @@ -1382,6 +1391,102 @@ func run() int { return finalExitCode } +const rollbackCountdownDisplayDuration = 10 * time.Second + +func printNetworkRollbackCountdown(abortInfo *orchestrator.RestoreAbortInfo) { + if abortInfo == nil { + return + } + + color := "\033[33m" // yellow + colorReset := "\033[0m" + + markerExists := false + if strings.TrimSpace(abortInfo.NetworkRollbackMarker) != "" { + if _, err := os.Stat(strings.TrimSpace(abortInfo.NetworkRollbackMarker)); err == nil { + markerExists = true + } + } + + status := "UNKNOWN" + switch { + case markerExists: + status = "ARMED (will execute automatically)" + case !abortInfo.RollbackDeadline.IsZero() && time.Now().After(abortInfo.RollbackDeadline): + status = "EXECUTED (marker removed)" + case strings.TrimSpace(abortInfo.NetworkRollbackMarker) != "": + status = "DISARMED/CLEARED (marker removed before deadline)" + case abortInfo.NetworkRollbackArmed: + status = "ARMED (status from snapshot)" + default: + status = "NOT ARMED" + } + + fmt.Println() + fmt.Printf("%s===========================================\n", color) + fmt.Printf("NETWORK ROLLBACK%s\n", colorReset) + fmt.Println() + + // Static info + fmt.Printf(" Status: %s\n", status) + if strings.TrimSpace(abortInfo.OriginalIP) != "" && abortInfo.OriginalIP != "unknown" { + fmt.Printf(" Pre-apply IP (from snapshot): %s\n", strings.TrimSpace(abortInfo.OriginalIP)) + } + if strings.TrimSpace(abortInfo.CurrentIP) != "" && abortInfo.CurrentIP != "unknown" { + fmt.Printf(" Post-apply IP (observed): %s\n", strings.TrimSpace(abortInfo.CurrentIP)) + } + if strings.TrimSpace(abortInfo.NetworkRollbackLog) != "" { + fmt.Printf(" Rollback log: %s\n", strings.TrimSpace(abortInfo.NetworkRollbackLog)) + } + fmt.Println() + + switch { + case markerExists && !abortInfo.RollbackDeadline.IsZero() && time.Until(abortInfo.RollbackDeadline) > 0: + fmt.Println("Connection will be temporarily interrupted during restore.") + 
if strings.TrimSpace(abortInfo.OriginalIP) != "" && abortInfo.OriginalIP != "unknown" { + fmt.Printf("Remember to reconnect using the pre-apply IP: %s\n", strings.TrimSpace(abortInfo.OriginalIP)) + } + case !markerExists && !abortInfo.RollbackDeadline.IsZero() && time.Now().After(abortInfo.RollbackDeadline): + if strings.TrimSpace(abortInfo.OriginalIP) != "" && abortInfo.OriginalIP != "unknown" { + fmt.Printf("Rollback executed: reconnect using the pre-apply IP: %s\n", strings.TrimSpace(abortInfo.OriginalIP)) + } + case !markerExists && strings.TrimSpace(abortInfo.NetworkRollbackMarker) != "": + if strings.TrimSpace(abortInfo.CurrentIP) != "" && abortInfo.CurrentIP != "unknown" { + fmt.Printf("Rollback will NOT run: reconnect using the post-apply IP: %s\n", strings.TrimSpace(abortInfo.CurrentIP)) + } + } + + // Live countdown for max 10 seconds (only when rollback is still armed). + if !markerExists || abortInfo.RollbackDeadline.IsZero() { + fmt.Printf("%s===========================================%s\n", color, colorReset) + return + } + + ticker := time.NewTicker(1 * time.Second) + defer ticker.Stop() + displayEnd := time.Now().Add(rollbackCountdownDisplayDuration) + + for { + remaining := time.Until(abortInfo.RollbackDeadline) + if remaining <= 0 { + fmt.Printf("\r Remaining: Rollback executing now... \n") + break + } + if time.Now().After(displayEnd) { + fmt.Printf("\r Remaining: %ds (exiting, rollback will proceed)\n", int(remaining.Seconds())) + break + } + fmt.Printf("\r Remaining: %ds ", int(remaining.Seconds())) + + select { + case <-ticker.C: + continue + } + } + + fmt.Printf("%s===========================================%s\n", color, colorReset) +} + func printFinalSummary(finalExitCode int) { fmt.Println() diff --git a/docs/BACKUP_ENV_MAPPING.md b/docs/BACKUP_ENV_MAPPING.md index a8c3e17..798bdcb 100644 --- a/docs/BACKUP_ENV_MAPPING.md +++ b/docs/BACKUP_ENV_MAPPING.md @@ -18,7 +18,6 @@ This file documents the mapping between variables from the **old Bash `backup.en AUTO_UPDATE_HASHES = SAME BACKUP_BLACKLIST = SAME BACKUP_CEPH_CONFIG = SAME -BACKUP_CLUSTER_CONFIG = SAME BACKUP_CRITICAL_FILES = SAME BACKUP_INSTALLED_PACKAGES = SAME BACKUP_PVE_BACKUP_FILES = SAME @@ -120,6 +119,11 @@ CLOUD_BACKUP_PATH = SEMANTIC CHANGE → CLOUD_REMOTE_PATH ⚠️ **Go**: `CLOUD_REMOTE="GoogleDrive"` (remote name) + `CLOUD_REMOTE_PATH="/proxsave/backup"` (path within that remote) **Migration**: remove the remote name from the bash variable and move the path into `CLOUD_REMOTE_PATH` +BACKUP_CLUSTER_CONFIG = SEMANTIC CHANGE ⚠️ + **Bash/legacy expectation**: skip only cluster config files. + **Go (current)**: also disables cluster runtime collection (`pvecm status`, `pvecm nodes`, HA status). + **Impact**: if you previously disabled cluster file backup but still wanted runtime status collection, re-enable `BACKUP_CLUSTER_CONFIG` (or introduce a separate runtime flag if needed). + STORAGE_WARNING_THRESHOLD_SECONDARY = SEMANTIC CHANGE → MIN_DISK_SPACE_SECONDARY_GB ⚠️ **Bash**: `STORAGE_WARNING_THRESHOLD_SECONDARY="90"` (90% used = warning) **Go**: `MIN_DISK_SPACE_SECONDARY_GB="10"` (minimum 10GB free required) diff --git a/docs/CLOUD_STORAGE.md b/docs/CLOUD_STORAGE.md index 8b1800a..7707fc8 100644 --- a/docs/CLOUD_STORAGE.md +++ b/docs/CLOUD_STORAGE.md @@ -339,7 +339,7 @@ This is sufficient to start! Other options use sensible defaults. 
# Cloud storage CLOUD_ENABLED=true CLOUD_REMOTE=GoogleDrive -CLOUD_REMOTE_PATH=/proxsave/backup # Complete folder path inside the remote +CLOUD_REMOTE_PATH=/proxsave/backup # Folder path inside the remote CLOUD_LOG_PATH=/proxsave/log # Optional: log folder inside the same remote # Upload mode @@ -377,13 +377,13 @@ RETENTION_YEARLY=3 |----------|---------|-------------| | `CLOUD_ENABLED` | `false` | Enable cloud storage | | `CLOUD_REMOTE` | _(required)_ | rclone remote **name** from `rclone config` (legacy `remote:path` still supported) | -| `CLOUD_REMOTE_PATH` | _(empty)_ | Full folder path/prefix inside the remote (e.g., `/proxsave/backup`) | -| `CLOUD_LOG_PATH` | _(empty)_ | Optional log folder on the same remote (set `remote:/path` only when using a different remote) | +| `CLOUD_REMOTE_PATH` | _(empty)_ | Folder path/prefix inside the remote (e.g., `/proxsave/backup`) | +| `CLOUD_LOG_PATH` | _(empty)_ | Optional log folder (recommended: path-only on the same remote; use `otherremote:/path` only when using a different remote) | | `CLOUD_UPLOAD_MODE` | `parallel` | `parallel` or `sequential` | | `CLOUD_PARALLEL_MAX_JOBS` | `2` | Max concurrent uploads (parallel mode) | | `CLOUD_PARALLEL_VERIFICATION` | `true` | Verify checksums after upload | | `CLOUD_WRITE_HEALTHCHECK` | `false` | Use write test for connectivity check | -| `RCLONE_TIMEOUT_CONNECTION` | `30` | Connection timeout (seconds) | +| `RCLONE_TIMEOUT_CONNECTION` | `30` | Connection timeout (seconds). Also used by restore/decrypt when scanning cloud backups (list + manifest read). | | `RCLONE_TIMEOUT_OPERATION` | `300` | Operation timeout (seconds) | | `RCLONE_BANDWIDTH_LIMIT` | _(empty)_ | Upload rate limit (e.g., `5M` = 5 MB/s) | | `RCLONE_TRANSFERS` | `4` | Number of parallel transfers | @@ -395,6 +395,29 @@ RETENTION_YEARLY=3 For complete configuration reference, see: **[Configuration Guide](CONFIGURATION.md)** +### Recommended Remote Path Formats (Important) + +ProxSave supports both “new style” (path-only) and “legacy style” (`remote:path`) values, but using a consistent format avoids confusion. + +**Recommended:** +- `CLOUD_REMOTE` should be just the **remote name** (no `:`), e.g. `nextcloud` or `GoogleDrive`. +- `CLOUD_REMOTE_PATH` should be a **path inside the remote** (no remote prefix). Use **no trailing slash**. A leading `/` is accepted. +- `CLOUD_LOG_PATH` should be a **folder path** for logs. When logs are stored on the **same remote**, prefer **path-only** here too (no remote prefix). Use `otherremote:/path` only if logs must go to a different remote than `CLOUD_REMOTE`. + +**Examples (same remote):** +```bash +CLOUD_REMOTE=nextcloud-katerasrael +CLOUD_REMOTE_PATH=B+K/BACKUP/marcellus +CLOUD_LOG_PATH=B+K/BACKUP/marcellus/logs +``` + +**Examples (different remotes for backups vs logs):** +```bash +CLOUD_REMOTE=nextcloud-backups +CLOUD_REMOTE_PATH=proxsave/backup/host1 +CLOUD_LOG_PATH=nextcloud-logs:proxsave/log/host1 +``` + ### Understanding CLOUD_REMOTE vs CLOUD_REMOTE_PATH **How CLOUD_REMOTE and CLOUD_REMOTE_PATH work together** @@ -414,6 +437,7 @@ path inside the remote, and uses that consistently for: - **uploads** (cloud backend); - **cloud retention**; - **restore / decrypt menus** (entry “Cloud backups (rclone)”). + - Restore/decrypt cloud scanning is protected by `RCLONE_TIMEOUT_CONNECTION` (increase it on slow remotes or very large directories). You can choose the style you prefer; they are equivalent from the tool’s point of view. 
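+
+To make the equivalence concrete, here is a minimal Go sketch of the normalization idea. It is illustrative only: `normalizeTarget` and its exact trimming rules are assumptions for this example, not ProxSave's actual code.
+
+```go
+package main
+
+import (
+	"fmt"
+	"strings"
+)
+
+// normalizeTarget shows how both configuration styles can collapse to the
+// same "remote:path" target (hypothetical helper, simplified rules).
+func normalizeTarget(cloudRemote, cloudRemotePath string) string {
+	remote := strings.TrimSpace(cloudRemote)
+	path := strings.Trim(strings.TrimSpace(cloudRemotePath), "/")
+	if strings.Contains(remote, ":") {
+		// Legacy style: CLOUD_REMOTE already carries "remote:path".
+		if path == "" {
+			return remote
+		}
+		return strings.TrimRight(remote, "/") + "/" + path
+	}
+	// New style: remote name plus separate path.
+	return remote + ":" + path
+}
+
+func main() {
+	fmt.Println(normalizeTarget("GoogleDrive", "/proxsave/backup")) // GoogleDrive:proxsave/backup
+	fmt.Println(normalizeTarget("GoogleDrive:proxsave/backup", "")) // GoogleDrive:proxsave/backup
+}
+```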
@@ -593,6 +617,7 @@ rm /tmp/test*.txt
 | `couldn't find configuration section 'gdrive'` | Remote not configured | `rclone config` → create remote |
 | `401 unauthorized` | Credentials expired | `rclone config reconnect gdrive` or regenerate keys |
 | `connection timeout (30s)` | Slow network | Increase `RCLONE_TIMEOUT_CONNECTION=60` |
+| `Timed out while scanning ... (rclone)` | Slow remote / huge directory | Increase `RCLONE_TIMEOUT_CONNECTION` and retry the restore/decrypt scan |
 | `operation timeout (300s exceeded)` | Large file + slow network | Increase `RCLONE_TIMEOUT_OPERATION=900` |
 | `429 Too Many Requests` | API rate limiting | Reduce `RCLONE_TRANSFERS=2`, increase `CLOUD_BATCH_PAUSE=3` |
 | `directory not found` | Path doesn't exist | `rclone mkdir gdrive:pbs-backups` |

diff --git a/docs/CONFIGURATION.md b/docs/CONFIGURATION.md
index eb0d295..f17a209 100644
--- a/docs/CONFIGURATION.md
+++ b/docs/CONFIGURATION.md
@@ -425,7 +425,11 @@ CLOUD_ENABLED=false # true | false
 CLOUD_REMOTE=GoogleDrive # remote name from `rclone config`
 CLOUD_REMOTE_PATH=/proxsave/backup # folder path inside the remote
 
-# Cloud log path (same remote, optional)
+# Cloud log path (optional)
+# Recommended (same remote): path-only (no remote prefix) and no trailing slash
+# CLOUD_LOG_PATH=proxsave/log
+# Legacy / different remote: explicit remote:path
+# CLOUD_LOG_PATH=OtherRemote:proxsave/log
 CLOUD_LOG_PATH=/proxsave/log # leave empty to disable log uploads
 
 # Legacy compatibility (still supported):
@@ -446,6 +450,20 @@ CLOUD_PARALLEL_VERIFICATION=true # true | false
 CLOUD_WRITE_HEALTHCHECK=false # true | false (auto-fallback mode vs force write test)
 ```
 
+### Recommended Remote Path Formats (Cloud)
+
+To avoid ambiguity, prefer consistent formats:
+- `CLOUD_REMOTE`: remote **name** only (no `:`), e.g. `nextcloud` or `GoogleDrive`.
+- `CLOUD_REMOTE_PATH`: path **inside** the remote (no remote prefix), **no trailing slash** (leading `/` is accepted).
+- `CLOUD_LOG_PATH`: log **folder path**. When logs are on the same remote as backups, prefer **path-only** here too; use `otherremote:/path` only when logs must go to a different remote.
+ +Example (same remote for backups + logs): +```bash +CLOUD_REMOTE=nextcloud-katerasrael +CLOUD_REMOTE_PATH=B+K/BACKUP/marcellus +CLOUD_LOG_PATH=B+K/BACKUP/marcellus/logs +``` + ### Connectivity Check Modes | Setting | Mode | Behavior | @@ -530,7 +548,7 @@ Quick comparison to help you choose the right storage configuration: ```bash # Connection timeout (seconds) -RCLONE_TIMEOUT_CONNECTION=30 # Remote accessibility check +RCLONE_TIMEOUT_CONNECTION=30 # Remote accessibility check (also used for restore/decrypt cloud scan) # Operation timeout (seconds) RCLONE_TIMEOUT_OPERATION=300 # Upload/download operations (5 minutes default) @@ -553,7 +571,7 @@ RCLONE_FLAGS="--checkers=4 --stats=0 --drive-use-trash=false --drive-pacer-min-s ### Timeout Tuning -- **CONNECTION**: Short timeout for quick accessibility check (default 30s) +- **CONNECTION**: Short timeout for quick accessibility check (default 30s); also caps restore/decrypt cloud scanning (listing backups + reading manifests) - **OPERATION**: Long timeout for large file uploads (increase for slow networks) ### Bandwidth Limit Format @@ -868,7 +886,7 @@ METRICS_PATH=${BASE_DIR}/metrics # Empty = /var/lib/prometheus/node-exporter ```bash # Cluster configuration -BACKUP_CLUSTER_CONFIG=true # /etc/pve/cluster files +BACKUP_CLUSTER_CONFIG=true # Cluster config + runtime (corosync, pvecm status/nodes, HA status) # PVE firewall rules BACKUP_PVE_FIREWALL=true # PVE firewall configuration diff --git a/docs/EXAMPLES.md b/docs/EXAMPLES.md index c104a2c..0a53278 100644 --- a/docs/EXAMPLES.md +++ b/docs/EXAMPLES.md @@ -853,39 +853,39 @@ CLOUD_LOG_PATH= ## Example 9: Test in a Chroot/Fixture -**Scenario**: Esegui la raccolta su un root alternativo (chroot, snapshot montato, fixture di test) senza toccare il filesystem live. +**Scenario**: Run collection against an alternate system root (chroot, mounted snapshot, test fixture) without touching the live filesystem. **Use case**: -- CI/test di backup in ambiente isolato -- Analisi offline di un'immagine/snapshot montata -- Esecuzione in container che monta un root diverso +- CI/test backups in an isolated environment +- Offline analysis of a mounted image/snapshot +- Running inside a container that mounts a different root ### Configuration ```bash # configs/backup.env -SYSTEM_ROOT_PREFIX=/mnt/snapshot-root # punta al root alternativo +SYSTEM_ROOT_PREFIX=/mnt/snapshot-root # points to the alternate root BACKUP_ENABLED=true ENABLE_GO_BACKUP=true -# /etc, /var, /root, /home vengono risolti sotto il prefisso +# /etc, /var, /root, /home are resolved under the prefix ``` ### Setup Steps ```bash -# 1) Monta o prepara il root alternativo -mount /dev/vg0/snap /mnt/snapshot-root # esempio +# 1) Mount or prepare the alternate root +mount /dev/vg0/snap /mnt/snapshot-root # example -# 2) Esegui un dry-run +# 2) Run a dry-run SYSTEM_ROOT_PREFIX=/mnt/snapshot-root ./build/proxsave --dry-run -# 3) Esegui il backup reale (opzionale) +# 3) Run the actual backup (optional) SYSTEM_ROOT_PREFIX=/mnt/snapshot-root ./build/proxsave ``` ### Expected Results -- I file raccolti riflettono il contenuto di `/mnt/snapshot-root/etc`, `/var`, `/root`, `/home`, ecc. -- Nessuna scrittura sul filesystem live del nodo. +- Collected files reflect the contents of `/mnt/snapshot-root/etc`, `/var`, `/root`, `/home`, etc. +- No writes to the node's live filesystem. 
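+
+As a rough illustration of the prefix mechanics, the sketch below shows how system paths can be re-rooted under `SYSTEM_ROOT_PREFIX`. The `resolveUnderRoot` helper is hypothetical, not ProxSave's actual collector code.
+
+```go
+package main
+
+import (
+	"fmt"
+	"path/filepath"
+)
+
+// resolveUnderRoot re-roots an absolute system path under an alternate root.
+// With an empty prefix, the live path is used unchanged.
+func resolveUnderRoot(rootPrefix, systemPath string) string {
+	if rootPrefix == "" {
+		return systemPath
+	}
+	return filepath.Join(rootPrefix, systemPath)
+}
+
+func main() {
+	fmt.Println(resolveUnderRoot("", "/etc/pve"))                   // /etc/pve
+	fmt.Println(resolveUnderRoot("/mnt/snapshot-root", "/etc/pve")) // /mnt/snapshot-root/etc/pve
+}
+```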
--- diff --git a/docs/RESTORE_GUIDE.md b/docs/RESTORE_GUIDE.md index d35414b..cffac04 100644 --- a/docs/RESTORE_GUIDE.md +++ b/docs/RESTORE_GUIDE.md @@ -1642,6 +1642,14 @@ Type "RESTORE" (exact case) to proceed, or "cancel"/"0" to abort: _ - Case-sensitive - Prevents accidental restoration +**Prompt timeouts (auto-skip)**: +- Some interactive prompts include a visible countdown (currently **90 seconds**) to avoid getting “stuck” waiting for input in remote/automated scenarios. +- If the user does not answer before the countdown reaches 0, ProxSave proceeds with a **safe default** (no destructive action) and logs the decision. + +Current auto-skip prompts: +- **Smart `/etc/fstab` merge**: defaults to **Skip** (no changes). +- **Live network apply** (“Apply restored network configuration now…”): defaults to **No** (stays staged/on-disk only; no live reload). + ### 3. Compatibility Validation **System Type Detection**: @@ -1662,6 +1670,9 @@ Type "yes" to continue anyway or "no" to abort: _ If the **network** category is restored, ProxSave can optionally apply the new network configuration immediately using a **transactional rollback timer**. +**Apply prompt auto-skip**: +- The “apply now” prompt includes a **90-second** countdown; if you do not answer in time, ProxSave defaults to **No** and skips the live reload. + **Important (console recommended)**: - Run the live network apply/commit step from the **local console** (physical console, IPMI/iDRAC/iLO, Proxmox console, or hypervisor console), not from SSH. - If the restored network config changes the management IP or routes, your SSH session will drop and you may be unable to type `COMMIT`. @@ -1704,7 +1715,90 @@ This protects SSH/GUI access during network changes. **Result reporting**: - If you do not type `COMMIT`, ProxSave completes the restore with warnings and reports that the original network settings were restored (including the current IP, when detectable), plus the rollback log path. -### 4. Hard Guards +#### Ctrl+C footer: `NETWORK ROLLBACK` status + +If you interrupt ProxSave with **Ctrl+C** and a live network apply/rollback timer was involved, the CLI footer can print a `NETWORK ROLLBACK` block with a recommended reconnection IP and the rollback log path. + +The status can be one of: +- **ARMED**: rollback is still pending and will execute automatically at the deadline (a short countdown may be shown). +- **DISARMED/CLEARED**: rollback will **not** run (the marker was removed before the deadline; this can happen if it was manually cleared/disarmed). +- **EXECUTED**: rollback already ran (marker removed after the deadline). + +**Which IP should I use?** +- **ARMED**: prepare to reconnect using the **pre-apply IP** once rollback runs. +- **EXECUTED**: reconnect using the **pre-apply IP** (the system should be back on the previous network config). +- **DISARMED/CLEARED**: reconnect using the **post-apply IP** (the applied config remains active). + +Notes: +- *Pre-apply IP* is derived from the `before.txt` snapshot in `/tmp/proxsave/network_apply_*` and may be `unknown` if it cannot be parsed. +- *Post-apply IP* is what ProxSave could observe on the management interface after applying the new config; it may include CIDR suffixes (for example `10.0.0.4/24`) or multiple addresses. + +**Example outputs** + +The `NETWORK ROLLBACK` block is printed just before the standard ProxSave footer (the footer color reflects the exit status, e.g. magenta on Ctrl+C). 
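+
+The status decision itself follows the marker/deadline rules listed above. As a minimal illustrative sketch (simplified, not the exact ProxSave source):
+
+```go
+package main
+
+import (
+	"fmt"
+	"time"
+)
+
+// rollbackStatus mirrors the documented rules: marker present => ARMED;
+// marker gone after the deadline => EXECUTED; marker gone before the
+// deadline => DISARMED/CLEARED.
+func rollbackStatus(markerExists bool, deadline, now time.Time) string {
+	switch {
+	case markerExists:
+		return "ARMED"
+	case !deadline.IsZero() && now.After(deadline):
+		return "EXECUTED"
+	default:
+		return "DISARMED/CLEARED"
+	}
+}
+
+func main() {
+	deadline := time.Now().Add(-time.Minute) // deadline already passed
+	fmt.Println(rollbackStatus(false, deadline, time.Now())) // EXECUTED
+}
+```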
+ +Example 1 — **ARMED** (rollback pending, countdown shown for a few seconds): +```text +=========================================== +NETWORK ROLLBACK + + Status: ARMED (will execute automatically) + Pre-apply IP (from snapshot): 192.168.1.100 + Post-apply IP (observed): 10.0.0.4/24 + Rollback log: /tmp/proxsave/network_rollback_20260122_153012.log + +Connection will be temporarily interrupted during restore. +Remember to reconnect using the pre-apply IP: 192.168.1.100 + Remaining: 147s +=========================================== + +=========================================== +ProxSave - Go - +=========================================== +``` + +Example 2 — **EXECUTED** (rollback already ran, no countdown): +```text +=========================================== +NETWORK ROLLBACK + + Status: EXECUTED (marker removed) + Pre-apply IP (from snapshot): 192.168.1.100 + Post-apply IP (observed): 10.0.0.4/24 + Rollback log: /tmp/proxsave/network_rollback_20260122_153012.log + +Rollback executed: reconnect using the pre-apply IP: 192.168.1.100 +=========================================== +``` + +Example 3 — **DISARMED/CLEARED** (rollback will not run, applied config remains active): +```text +=========================================== +NETWORK ROLLBACK + + Status: DISARMED/CLEARED (marker removed before deadline) + Pre-apply IP (from snapshot): 192.168.1.100 + Post-apply IP (observed): 10.0.0.4/24 + Rollback log: /tmp/proxsave/network_rollback_20260122_153012.log + +Rollback will NOT run: reconnect using the post-apply IP: 10.0.0.4/24 +=========================================== +``` + +### 5. Smart `/etc/fstab` Merge (Optional) + +If the restore includes filesystem configuration (notably `/etc/fstab`), ProxSave can run a **smart merge** instead of blindly overwriting your current `fstab`. + +**What it does**: +- Compares the current `/etc/fstab` with the backup copy. +- Keeps existing critical entries (for example, root and swap) when they already match the running system. +- Detects **safe mount candidates** from the backup (for example, additional NFS mounts) and offers to add them. + +**Safety behavior**: +- The user is prompted before any change is written. +- The prompt includes a **90-second** countdown; if you do not answer in time, ProxSave defaults to **Skip** (no changes). + +### 6. Hard Guards **Path Traversal Prevention**: - All extracted paths validated @@ -1721,7 +1815,7 @@ if cleanDestRoot == "/" && strings.HasPrefix(target, "/etc/pve") { ``` - Absolute prevention of `/etc/pve` corruption -### 5. Service Management Fail-Fast +### 7. Service Management Fail-Fast **Service Stop**: If ANY service fails to stop → ABORT entire restore @@ -1730,7 +1824,7 @@ if cleanDestRoot == "/" && strings.HasPrefix(target, "/etc/pve") { - Better to fail safely than corrupt database - User can investigate and retry -### 6. Comprehensive Logging +### 8. Comprehensive Logging **Detailed Log**: `/tmp/proxsave/restore_YYYYMMDD_HHMMSS.log` @@ -1768,7 +1862,7 @@ grep "storage.cfg" /tmp/proxsave/restore_20251120_143052.log grep "FAILED" /tmp/proxsave/restore_20251120_143052.log ``` -### 7. Checksum Verification +### 9. Checksum Verification **SHA256 Verification**: - Backup checksum verified after decryption @@ -1783,7 +1877,7 @@ Actual: a1b2c3... ✓ Checksum verified successfully. ``` -### 8. Deferred Service Restart +### 10. 
Deferred Service Restart

**Go defer pattern** ensures services restart even if restore fails:

@@ -2470,24 +2564,25 @@ A: Yes, in two ways:
       CLOUD_REMOTE_PATH=/pbs-backups/server1
       ```

-   - During `--decrypt` or `--restore` (CLI o TUI), ProxSave leggerà la stessa
-     combinazione `CLOUD_REMOTE` / `CLOUD_REMOTE_PATH` e mostrerà una voce:
+   - During `--decrypt` or `--restore` (CLI or TUI), ProxSave will read the same
+     `CLOUD_REMOTE` / `CLOUD_REMOTE_PATH` combination and show an entry:
     - `Cloud backups (rclone)`
-   - Selezionandola, il tool:
-     - elenca i bundle `.bundle.tar` sul remote con `rclone lsf`;
-     - legge i metadata/manifest tramite `rclone cat` (senza scaricare tutto);
-     - quando scegli un backup, lo scarica in `/tmp/proxsave` e procede al decrypt/restore.
+   - When selected, the tool:
+     - lists `.bundle.tar` bundles on the remote with `rclone lsf`;
+     - reads metadata/manifest via `rclone cat` (without downloading everything);
+     - when you pick a backup, downloads it to `/tmp/proxsave` and proceeds with decrypt/restore.
+   - If scanning times out (slow remote / huge directory), increase `RCLONE_TIMEOUT_CONNECTION` and retry.

-2. **Da mount locale rclone (solo restore)**
-   Se preferisci montare il backend rclone come filesystem locale:
+2. **From a local rclone mount (restore-only)**
+   If you prefer to mount the rclone backend as a local filesystem:

   ```bash
   # Mount cloud storage locally
   rclone mount remote:bucket /mnt/cloud &

   # Configure in backup.env (restore-only scenario)
-   CLOUD_ENABLED=false # cloud upload disabilitato
-   # Usa BACKUP_PATH / SECONDARY_PATH oppure naviga il mount direttamente
+   CLOUD_ENABLED=false # cloud upload disabled
+   # Use BACKUP_PATH / SECONDARY_PATH or browse the mount directly
   ```

-   In questo caso puoi:
+   In this case you can:

diff --git a/docs/TROUBLESHOOTING.md b/docs/TROUBLESHOOTING.md
index 740d962..67905f2 100644
--- a/docs/TROUBLESHOOTING.md
+++ b/docs/TROUBLESHOOTING.md
@@ -277,6 +277,28 @@ iptables -L -n | grep -i drop

 ---

+#### Restore/Decrypt: stuck on “Scanning backup path…” or timeout (cloud/rclone)
+
+**Cause**: The tool scans cloud backups by listing the remote (`rclone lsf`) and reading each backup manifest (`rclone cat`). On slow remotes or very large directories this can time out.
+
+**Solution**:
+```bash
+# Increase scan timeout
+nano configs/backup.env
+RCLONE_TIMEOUT_CONNECTION=120
+
+# Re-run restore with debug logs (restore log path is printed on start)
+./build/proxsave --restore --log-level debug
+```
+
+If it still fails, run the equivalent manual checks (replace the `<...>` placeholders with your remote and bundle name):
+```bash
+rclone lsf <remote:path>
+rclone cat <remote:path>/<backup-name>.bundle.tar | head
+```
+
+---
+
 #### Error: `operation timeout (300s exceeded)`

 **Cause**: Large backup file + slow upload speed.

@@ -299,6 +321,8 @@ BACKUP_CLUSTER_CONFIG=false
 BACKUP_ROOT_HOME=false
 ```

+> Note: `BACKUP_CLUSTER_CONFIG=false` also skips cluster runtime collection (`pvecm status`, `pvecm nodes`, HA status), which helps avoid non-critical cluster warnings on standalone nodes.
+
 ---

 #### Error: `429 Too Many Requests` (API rate limiting)

@@ -553,6 +577,23 @@ MIN_DISK_SPACE_PRIMARY_GB=5 # Lower threshold

 ---

 ### 7. Restore Issues

+#### Restore drops SSH / IP changes during network restore
+
+**Symptoms**:
+- SSH/Web UI disconnects during restore when the `network` category is applied live
+- You see a `NETWORK ROLLBACK` block in the footer (especially after Ctrl+C)
+
+**Explanation**:
+- Live network apply can change IP/routes immediately.
+- ProxSave protects access by arming a rollback timer that can revert network-related files automatically if `COMMIT` is not received in time. + +**What to do**: +- Prefer running restore from the **local console/IPMI**, not over SSH. +- If the footer says **ARMED**, reconnect using the **pre-apply IP** once rollback runs. +- If it says **EXECUTED**, reconnect using the **pre-apply IP** (rollback already ran). +- If it says **DISARMED/CLEARED**, reconnect using the **post-apply IP** (new config remains active). +- Check the rollback log path printed in the footer for details. + #### Error during network preflight: `addr_add_dry_run() got an unexpected keyword argument 'nodad'` **Symptoms**: diff --git a/internal/backup/collector.go b/internal/backup/collector.go index a209020..1f62ed5 100644 --- a/internal/backup/collector.go +++ b/internal/backup/collector.go @@ -828,7 +828,7 @@ func (c *Collector) safeCmdOutput(ctx context.Context, cmd, output, description c.incFilesFailed() return fmt.Errorf("critical command `%s` failed for %s: %w (output: %s)", cmdString, description, err, summarizeCommandOutputText(string(out))) } - c.logger.Warning("Skipping %s: command `%s` failed (%v). Non-critical; backup continues. Ensure the PBS CLI is available and has proper permissions. Output: %s", + c.logger.Warning("Skipping %s: command `%s` failed (%v). Non-critical; backup continues. Ensure the required CLI is available and has proper permissions. Output: %s", description, cmdString, err, diff --git a/internal/backup/collector_pve.go b/internal/backup/collector_pve.go index 6324816..2a56e8b 100644 --- a/internal/backup/collector_pve.go +++ b/internal/backup/collector_pve.go @@ -346,7 +346,7 @@ func (c *Collector) collectPVECommands(ctx context.Context, clustered bool) (*pv } // Cluster commands (if clustered) - if clustered { + if clustered && c.config.BackupClusterConfig { c.safeCmdOutput(ctx, "pvecm status", filepath.Join(commandsDir, "cluster_status.txt"), @@ -365,6 +365,8 @@ func (c *Collector) collectPVECommands(ctx context.Context, clustered bool) (*pv filepath.Join(commandsDir, "ha_status.json"), "HA status", false) + } else if clustered && !c.config.BackupClusterConfig { + c.logger.Debug("Skipping cluster runtime commands: BACKUP_CLUSTER_CONFIG=false (clustered=%v)", clustered) } // Storage status @@ -1651,25 +1653,53 @@ func (c *Collector) isClusteredPVE(ctx context.Context) (bool, error) { return false, err } + corosyncPath := c.effectiveCorosyncConfigPath() + corosyncExists := false + if corosyncPath != "" { + if _, err := c.depStat(corosyncPath); err == nil { + corosyncExists = true + } + } + c.logger.Debug("Cluster detection: corosyncPath=%q exists=%v", corosyncPath, corosyncExists) + if c.hasCorosyncClusterConfig() { c.logger.Debug("Detected cluster via corosync configuration") return true, nil } - if c.hasMultiplePVENodes() { - c.logger.Debug("Detected cluster via nodes directory count") - return true, nil + nodeCount, nodeErr := c.pveNodesDirCount() + if nodeErr != nil { + c.logger.Debug("Cluster detection: nodes directory count unavailable: %v", nodeErr) + } else { + c.logger.Debug("Cluster detection: nodes directory count=%d", nodeCount) + if nodeCount > 1 { + if corosyncExists { + c.logger.Debug("Detected cluster via nodes directory count (corosync config present)") + return true, nil + } + c.logger.Debug("Cluster detection: nodes directory suggests cluster, but corosync config missing; deferring to service/pvecm checks") + } } - if c.isServiceActive(ctx, "corosync.service") { - 
c.logger.Debug("Detected cluster via corosync.service state") - return true, nil + corosyncActive := c.isServiceActive(ctx, "corosync.service") + c.logger.Debug("Cluster detection: corosync.service active=%v", corosyncActive) + if corosyncActive { + if corosyncExists { + c.logger.Debug("Detected cluster via corosync.service state (corosync config present)") + return true, nil + } + c.logger.Debug("Cluster detection: corosync.service active but corosync config missing; deferring to pvecm check") } // Fallback to pvecm status if _, err := c.depLookPath("pvecm"); err == nil { output, err := c.depRunCommand(ctx, "pvecm", "status") if err != nil { + outText := strings.TrimSpace(string(output)) + if isPvecmMissingCorosyncConfig(outText) { + c.logger.Debug("Cluster detection: pvecm status indicates no corosync configuration; treating as standalone. Output: %s", summarizeCommandOutputText(outText)) + return false, nil + } return false, fmt.Errorf("pvecm status failed: %w", err) } clustered := strings.Contains(string(output), "Cluster information") @@ -1691,12 +1721,7 @@ func shortHostname(host string) string { } func (c *Collector) hasCorosyncClusterConfig() bool { - corosyncPath := c.config.CorosyncConfigPath - if corosyncPath == "" { - corosyncPath = filepath.Join(c.effectivePVEConfigPath(), "corosync.conf") - } else if !filepath.IsAbs(corosyncPath) { - corosyncPath = filepath.Join(c.effectivePVEConfigPath(), corosyncPath) - } + corosyncPath := c.effectiveCorosyncConfigPath() data, err := os.ReadFile(corosyncPath) if err != nil { return false @@ -1710,22 +1735,43 @@ func (c *Collector) hasCorosyncClusterConfig() bool { return false } +func (c *Collector) effectiveCorosyncConfigPath() string { + corosyncPath := strings.TrimSpace(c.config.CorosyncConfigPath) + if corosyncPath == "" { + return filepath.Join(c.effectivePVEConfigPath(), "corosync.conf") + } + if filepath.IsAbs(corosyncPath) { + return corosyncPath + } + return filepath.Join(c.effectivePVEConfigPath(), corosyncPath) +} + func (c *Collector) hasMultiplePVENodes() bool { + count, err := c.pveNodesDirCount() + return err == nil && count > 1 +} + +func (c *Collector) pveNodesDirCount() (int, error) { nodesDir := filepath.Join(c.effectivePVEConfigPath(), "nodes") entries, err := os.ReadDir(nodesDir) if err != nil { - return false + return 0, err } count := 0 for _, entry := range entries { if entry.IsDir() { count++ - if count > 1 { - return true - } } } - return false + return count, nil +} + +func isPvecmMissingCorosyncConfig(output string) bool { + if output == "" { + return false + } + lower := strings.ToLower(output) + return strings.Contains(lower, "corosync config") && strings.Contains(lower, "does not exist") } func (c *Collector) isServiceActive(ctx context.Context, service string) bool { diff --git a/internal/backup/collector_pve_test.go b/internal/backup/collector_pve_test.go index 4c3c38a..4d3ea8f 100644 --- a/internal/backup/collector_pve_test.go +++ b/internal/backup/collector_pve_test.go @@ -49,6 +49,44 @@ func TestIsClusteredPVEFallbackToPvecm(t *testing.T) { } } +func TestIsClusteredPVEMissingCorosyncConfigIsStandalone(t *testing.T) { + collector := newPVECollectorWithDeps(t, CollectorDeps{ + LookPath: func(cmd string) (string, error) { + switch cmd { + case "pvecm": + return "/usr/bin/pvecm", nil + case "systemctl": + return "", fmt.Errorf("missing systemctl") + default: + return "", fmt.Errorf("unexpected lookPath for %s", cmd) + } + }, + RunCommand: func(ctx context.Context, name string, args ...string) ([]byte, error) 
{ + if name != "pvecm" { + return nil, fmt.Errorf("unexpected command %s", name) + } + if len(args) != 1 || args[0] != "status" { + return nil, fmt.Errorf("unexpected args %v", args) + } + return []byte("Error: Corosync config '/etc/pve/corosync.conf' does not exist - is this node part of a cluster?\n"), fmt.Errorf("exit status 2") + }, + }) + collector.config.CorosyncConfigPath = "" + + nodesDir := filepath.Join(collector.config.PVEConfigPath, "nodes", "single") + if err := os.MkdirAll(nodesDir, 0o755); err != nil { + t.Fatalf("failed to create nodes dir: %v", err) + } + + clustered, err := collector.isClusteredPVE(context.Background()) + if err != nil { + t.Fatalf("isClusteredPVE returned error: %v", err) + } + if clustered { + t.Fatal("expected clustered=false when pvecm reports missing corosync config") + } +} + func TestIsServiceActive(t *testing.T) { t.Run("missing systemctl", func(t *testing.T) { collector := newPVECollectorWithDeps(t, CollectorDeps{ @@ -140,6 +178,29 @@ func TestCephStorageConfigured(t *testing.T) { }) } +func TestCollectPVECommandsSkipsClusterRuntimeWhenBackupClusterConfigDisabled(t *testing.T) { + pvecmCalls := 0 + collector := newPVECollectorWithDeps(t, CollectorDeps{ + LookPath: func(cmd string) (string, error) { + return "/usr/bin/" + cmd, nil + }, + RunCommand: func(ctx context.Context, name string, args ...string) ([]byte, error) { + if name == "pvecm" { + pvecmCalls++ + } + return []byte{}, nil + }, + }) + collector.config.BackupClusterConfig = false + + if _, err := collector.collectPVECommands(context.Background(), true); err != nil { + t.Fatalf("collectPVECommands returned error: %v", err) + } + if pvecmCalls != 0 { + t.Fatalf("expected pvecm not to be called when BACKUP_CLUSTER_CONFIG=false, got %d calls", pvecmCalls) + } +} + func TestCephStatusAvailable(t *testing.T) { t.Run("cli missing", func(t *testing.T) { collector := newPVECollectorWithDeps(t, CollectorDeps{ diff --git a/internal/backup/collector_pve_util_test.go b/internal/backup/collector_pve_util_test.go index 3e0951f..759d2f7 100644 --- a/internal/backup/collector_pve_util_test.go +++ b/internal/backup/collector_pve_util_test.go @@ -684,6 +684,9 @@ func TestIsClusteredPVE(t *testing.T) { if err := os.MkdirAll(filepath.Join(nodesDir, "pve2"), 0o755); err != nil { t.Fatal(err) } + if err := os.WriteFile(filepath.Join(pveDir, "corosync.conf"), []byte("nodelist {}\n"), 0o644); err != nil { + t.Fatal(err) + } logger := newTestLogger() cfg := GetDefaultCollectorConfig() @@ -696,7 +699,33 @@ func TestIsClusteredPVE(t *testing.T) { t.Fatalf("isClusteredPVE error: %v", err) } if !clustered { - t.Error("isClusteredPVE() should return true with multiple nodes") + t.Error("isClusteredPVE() should return true when multiple nodes exist and corosync config is present") + } + }) + + t.Run("does not detect via multiple nodes alone", func(t *testing.T) { + tmpDir := t.TempDir() + pveDir := filepath.Join(tmpDir, "etc", "pve") + nodesDir := filepath.Join(pveDir, "nodes") + if err := os.MkdirAll(filepath.Join(nodesDir, "pve1"), 0o755); err != nil { + t.Fatal(err) + } + if err := os.MkdirAll(filepath.Join(nodesDir, "pve2"), 0o755); err != nil { + t.Fatal(err) + } + + logger := newTestLogger() + cfg := GetDefaultCollectorConfig() + cfg.PVEConfigPath = pveDir + cfg.CorosyncConfigPath = "" + collector := NewCollector(logger, cfg, tmpDir, "pve", false) + + clustered, err := collector.isClusteredPVE(context.Background()) + if err != nil { + t.Fatalf("isClusteredPVE error: %v", err) + } + if clustered { + 
t.Error("isClusteredPVE() should return false when only multiple node directories exist without corosync config") } }) diff --git a/internal/config/templates/backup.env b/internal/config/templates/backup.env index b7837a1..17cef09 100644 --- a/internal/config/templates/backup.env +++ b/internal/config/templates/backup.env @@ -146,8 +146,9 @@ CLOUD_ENABLED=false # true-false = enable/disable cloud backup CLOUD_REMOTE=GoogleDrive # rclone remote NAME (from "rclone config") CLOUD_REMOTE_PATH=/proxsave/backup # Optional subdirectory within the remote # Cloud log storage path: -# - New style: path only (remote name is taken from CLOUD_REMOTE): /proxsave/log -# - Legacy style: full remote:path: myremote:/proxsave/log +# - Recommended (same remote): path-only (remote name is taken from CLOUD_REMOTE), no trailing slash: proxsave/log +# - New style (also ok): /proxsave/log +# - Legacy / different remote: explicit remote:path: myremote:proxsave/log (or myremote:/proxsave/log) CLOUD_LOG_PATH=/proxsave/log CLOUD_UPLOAD_MODE=parallel # sequential | parallel (parallel uses a worker pool for associated/bundle files) CLOUD_PARALLEL_MAX_JOBS=2 # Parallel workers when CLOUD_UPLOAD_MODE=parallel @@ -159,6 +160,7 @@ CLOUD_WRITE_HEALTHCHECK=false # false = auto (list + fallback to write te # ---------------------------------------------------------------------- # CONNECTION timeout: remote accessibility check (short) # OPERATION timeout: full upload/download operations (long) +# NOTE: The connection timeout is also used by restore/decrypt workflows when scanning cloud backups. RCLONE_TIMEOUT_CONNECTION=30 # seconds RCLONE_TIMEOUT_OPERATION=300 # seconds RCLONE_BANDWIDTH_LIMIT="10M" # e.g. "10M" for 10 MB/s, empty = unlimited @@ -268,6 +270,8 @@ METRICS_PATH=${BASE_DIR}/metrics # Collector options # ---------------------------------------------------------------------- # PVE +# BACKUP_CLUSTER_CONFIG controls both cluster config collection and cluster runtime commands +# (corosync.conf / pve-cluster data, pvecm status/nodes, and HA status collection). BACKUP_CLUSTER_CONFIG=true BACKUP_PVE_FIREWALL=true BACKUP_VZDUMP_CONFIG=true diff --git a/internal/input/input.go b/internal/input/input.go index 7dafd2d..500e43e 100644 --- a/internal/input/input.go +++ b/internal/input/input.go @@ -15,6 +15,15 @@ import ( // Callers should translate this into the appropriate workflow-level abort error. var ErrInputAborted = errors.New("input aborted") +// IsAborted reports whether an operation was aborted by the user (typically via Ctrl+C), +// by checking for ErrInputAborted and context cancellation. +func IsAborted(err error) bool { + if err == nil { + return false + } + return errors.Is(err, ErrInputAborted) || errors.Is(err, context.Canceled) +} + // MapInputError normalizes common stdin errors (EOF/closed fd) into ErrInputAborted. 
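+// The normalized error can then be checked with IsAborted; illustrative usage:
+//
+//	line, err := ReadLineWithContext(ctx, reader)
+//	if IsAborted(MapInputError(err)) {
+//		// treat as a user abort (Ctrl+C / EOF / closed stdin)
+//	}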
func MapInputError(err error) error { if err == nil { diff --git a/internal/input/input_test.go b/internal/input/input_test.go index 9369375..cad4217 100644 --- a/internal/input/input_test.go +++ b/internal/input/input_test.go @@ -39,6 +39,21 @@ func TestMapInputError(t *testing.T) { } } +func TestIsAborted(t *testing.T) { + if IsAborted(nil) { + t.Fatalf("expected false for nil") + } + if !IsAborted(ErrInputAborted) { + t.Fatalf("expected true for ErrInputAborted") + } + if !IsAborted(context.Canceled) { + t.Fatalf("expected true for context.Canceled") + } + if IsAborted(errors.New("other")) { + t.Fatalf("expected false for non-abort errors") + } +} + func TestReadLineWithContext_ReturnsLine(t *testing.T) { reader := bufio.NewReader(strings.NewReader("hello\n")) got, err := ReadLineWithContext(context.Background(), reader) @@ -145,4 +160,3 @@ func TestReadPasswordWithContext_CancelledReturnsAborted(t *testing.T) { t.Fatalf("err=%v; want %v", err, ErrInputAborted) } } - diff --git a/internal/logging/bootstrap.go b/internal/logging/bootstrap.go index f1baa96..49dc83a 100644 --- a/internal/logging/bootstrap.go +++ b/internal/logging/bootstrap.go @@ -15,8 +15,8 @@ type bootstrapEntry struct { raw bool } -// BootstrapLogger accumula i log generati prima che il logger principale sia -// inizializzato, così da poterli riversare nel log finale. +// BootstrapLogger accumulates logs generated before the main logger is initialized, +// so they can be flushed into the final log. type BootstrapLogger struct { mu sync.Mutex entries []bootstrapEntry @@ -25,35 +25,35 @@ type BootstrapLogger struct { mirror *Logger } -// NewBootstrapLogger crea un nuovo bootstrap logger con livello INFO di default. +// NewBootstrapLogger creates a new bootstrap logger with INFO level by default. func NewBootstrapLogger() *BootstrapLogger { return &BootstrapLogger{ minLevel: types.LogLevelInfo, } } -// SetLevel aggiorna il livello minimo che verrà usato al flush. +// SetLevel updates the minimum level used during Flush. func (b *BootstrapLogger) SetLevel(level types.LogLevel) { b.mu.Lock() defer b.mu.Unlock() b.minLevel = level } -// Println registra una riga “raw” (usata per banner/testo senza header). +// Println records a raw line (used for banners/text without a header). func (b *BootstrapLogger) Println(message string) { fmt.Println(message) b.mirrorLog(types.LogLevelInfo, message) b.recordRaw(message) } -// Debug registra un messaggio di debug senza stamparlo a console. +// Debug records a debug message without printing it to the console. func (b *BootstrapLogger) Debug(format string, args ...interface{}) { msg := fmt.Sprintf(format, args...) b.mirrorLog(types.LogLevelDebug, msg) b.record(types.LogLevelDebug, msg) } -// Printf registra una riga formattata come raw. +// Printf records a formatted line as raw. func (b *BootstrapLogger) Printf(format string, args ...interface{}) { msg := fmt.Sprintf(format, args...) fmt.Println(msg) @@ -69,7 +69,7 @@ func (b *BootstrapLogger) Info(format string, args ...interface{}) { b.record(types.LogLevelInfo, msg) } -// Warning registra un messaggio di warning precoce (stampato su stderr). +// Warning records an early warning message (printed to stderr). func (b *BootstrapLogger) Warning(format string, args ...interface{}) { msg := fmt.Sprintf(format, args...) if !strings.HasSuffix(msg, "\n") { @@ -81,7 +81,7 @@ func (b *BootstrapLogger) Warning(format string, args ...interface{}) { b.record(types.LogLevelWarning, msg) } -// Error registra un messaggio di errore precoce (stderr). 
+// Error records an early error message (stderr). func (b *BootstrapLogger) Error(format string, args ...interface{}) { msg := fmt.Sprintf(format, args...) if !strings.HasSuffix(msg, "\n") { @@ -112,7 +112,7 @@ func (b *BootstrapLogger) recordRaw(message string) { }) } -// Flush riversa le entry accumulate nel logger principale (solo la prima volta). +// Flush flushes accumulated entries into the main logger (only the first time). func (b *BootstrapLogger) Flush(logger *Logger) { b.mu.Lock() defer b.mu.Unlock() diff --git a/internal/logging/logger.go b/internal/logging/logger.go index c29c62b..5f000e2 100644 --- a/internal/logging/logger.go +++ b/internal/logging/logger.go @@ -10,20 +10,20 @@ import ( "github.com/tis24dev/proxsave/internal/types" ) -// Logger gestisce il logging dell'applicazione +// Logger handles application logging. type Logger struct { mu sync.Mutex level types.LogLevel useColor bool output io.Writer timeFormat string - logFile *os.File // File di log (opzionale) + logFile *os.File // Log file (optional) warningCount int64 errorCount int64 exitFunc func(int) } -// New crea un nuovo logger +// New creates a new logger. func New(level types.LogLevel, useColor bool) *Logger { return &Logger{ level: level, @@ -34,7 +34,7 @@ func New(level types.LogLevel, useColor bool) *Logger { } } -// SetOutput imposta l'output writer del logger +// SetOutput sets the logger output writer. func (l *Logger) SetOutput(w io.Writer) { l.mu.Lock() defer l.mu.Unlock() @@ -45,7 +45,7 @@ func (l *Logger) SetOutput(w io.Writer) { l.output = w } -// SetLevel imposta il livello di logging +// SetLevel sets the logging level. func (l *Logger) SetLevel(level types.LogLevel) { l.mu.Lock() defer l.mu.Unlock() @@ -64,17 +64,17 @@ func (l *Logger) SetExitFunc(fn func(int)) { l.exitFunc = fn } -// OpenLogFile apre un file di log e inizia la scrittura real-time +// OpenLogFile opens a log file and starts real-time writing. func (l *Logger) OpenLogFile(logPath string) error { l.mu.Lock() defer l.mu.Unlock() - // Se c'è già un file aperto, chiudilo prima + // If a log file is already open, close it first. if l.logFile != nil { l.logFile.Close() } - // Crea il file di log (O_CREATE|O_WRONLY|O_APPEND) - // O_SYNC forza la scrittura immediata su disco (real-time, nessun buffer) + // Create the log file (O_CREATE|O_WRONLY|O_APPEND). + // O_SYNC forces immediate writes to disk (real-time, no buffering). file, err := os.OpenFile(logPath, os.O_CREATE|os.O_WRONLY|os.O_APPEND|os.O_SYNC, 0600) if err != nil { return fmt.Errorf("failed to open log file %s: %w", logPath, err) @@ -84,7 +84,7 @@ func (l *Logger) OpenLogFile(logPath string) error { return nil } -// CloseLogFile chiude il file di log (da chiamare dopo le notifiche) +// CloseLogFile closes the log file (call after notifications). func (l *Logger) CloseLogFile() error { l.mu.Lock() defer l.mu.Unlock() @@ -97,7 +97,7 @@ func (l *Logger) CloseLogFile() error { return err } -// GetLogFilePath restituisce il path del file di log attualmente aperto (o "" se nessuno) +// GetLogFilePath returns the path of the currently open log file (or "" if none). func (l *Logger) GetLogFilePath() string { l.mu.Lock() defer l.mu.Unlock() @@ -112,14 +112,14 @@ func (l *Logger) UsesColor() bool { return l.useColor } -// GetLevel restituisce il livello corrente +// GetLevel returns the current log level. 
func (l *Logger) GetLevel() types.LogLevel { l.mu.Lock() defer l.mu.Unlock() return l.level } -// log è il metodo interno per scrivere i log +// log is the internal method used to write logs. func (l *Logger) log(level types.LogLevel, format string, args ...interface{}) { l.logWithLabel(level, "", "", format, args...) } @@ -169,7 +169,7 @@ func (l *Logger) logWithLabel(level types.LogLevel, label string, colorOverride } } - // Formato per stdout (con colori se abilitati) + // Format for stdout (with colors if enabled). outputStdout := fmt.Sprintf("[%s] %s%-8s%s %s\n", timestamp, colorCode, @@ -178,17 +178,17 @@ func (l *Logger) logWithLabel(level types.LogLevel, label string, colorOverride message, ) - // Formato per file (senza colori) + // Format for file (without colors). outputFile := fmt.Sprintf("[%s] %-8s %s\n", timestamp, levelStr, message, ) - // Scrivi su stdout con colori + // Write to stdout with colors. fmt.Fprint(l.output, outputStdout) - // Se c'è un file di log, scrivi anche lì (senza colori) + // If a log file is open, write there too (without colors). if l.logFile != nil { fmt.Fprint(l.logFile, outputFile) } @@ -208,7 +208,7 @@ func (l *Logger) HasErrors() bool { return l.errorCount > 0 } -// Debug scrive un log di debug +// Debug writes a debug log. func (l *Logger) Debug(format string, args ...interface{}) { l.log(types.LogLevelDebug, format, args...) } @@ -254,17 +254,17 @@ func (l *Logger) Skip(format string, args ...interface{}) { l.logWithLabel(types.LogLevelInfo, "SKIP", colorOverride, format, args...) } -// Warning scrive un log di warning +// Warning writes a warning log. func (l *Logger) Warning(format string, args ...interface{}) { l.log(types.LogLevelWarning, format, args...) } -// Error scrive un log di errore +// Error writes an error log. func (l *Logger) Error(format string, args ...interface{}) { l.log(types.LogLevelError, format, args...) } -// Critical scrive un log critico +// Critical writes a critical log. func (l *Logger) Critical(format string, args ...interface{}) { l.log(types.LogLevelCritical, format, args...) } @@ -305,19 +305,19 @@ func init() { defaultLogger = New(types.LogLevelInfo, true) } -// SetDefaultLogger imposta il logger di default +// SetDefaultLogger sets the default logger. func SetDefaultLogger(logger *Logger) { defaultLogger = logger } -// GetDefaultLogger restituisce il logger di default +// GetDefaultLogger returns the default logger. func GetDefaultLogger() *Logger { return defaultLogger } // Package-level convenience functions -// Debug scrive un log di debug usando il logger di default +// Debug writes a debug log using the default logger. func Debug(format string, args ...interface{}) { defaultLogger.Debug(format, args...) } @@ -327,27 +327,27 @@ func Info(format string, args ...interface{}) { defaultLogger.Info(format, args...) } -// Step scrive un log STEP usando il logger di default +// Step writes a STEP log using the default logger. func Step(format string, args ...interface{}) { defaultLogger.Step(format, args...) } -// Skip scrive un log SKIP usando il logger di default +// Skip writes a SKIP log using the default logger. func Skip(format string, args ...interface{}) { defaultLogger.Skip(format, args...) } -// Warning scrive un log di warning usando il logger di default +// Warning writes a warning log using the default logger. func Warning(format string, args ...interface{}) { defaultLogger.Warning(format, args...) 
} -// Error scrive un log di errore usando il logger di default +// Error writes an error log using the default logger. func Error(format string, args ...interface{}) { defaultLogger.Error(format, args...) } -// Critical scrive un log critico usando il logger di default +// Critical writes a critical log using the default logger. func Critical(format string, args ...interface{}) { defaultLogger.Critical(format, args...) } diff --git a/internal/notify/telegram_registration.go b/internal/notify/telegram_registration.go index 9f19a4e..69a4fd3 100644 --- a/internal/notify/telegram_registration.go +++ b/internal/notify/telegram_registration.go @@ -29,14 +29,14 @@ func truncateTelegramRegistrationBody(body []byte, max int) string { return text } -// TelegramRegistrationStatus rappresenta l'esito dell'handshake con il server Telegram centralizzato. +// TelegramRegistrationStatus represents the result of the handshake with the centralized Telegram server. type TelegramRegistrationStatus struct { Code int Message string Error error } -// CheckTelegramRegistration verifica lo stato della registrazione sul server centralizzato. +// CheckTelegramRegistration checks the registration status on the centralized server. func CheckTelegramRegistration(ctx context.Context, serverAPIHost, serverID string, logger *logging.Logger) TelegramRegistrationStatus { logTelegramRegistrationDebug(logger, "Telegram registration: start (serverAPIHost=%q serverID=%q len=%d)", serverAPIHost, serverID, len(serverID)) @@ -44,7 +44,7 @@ func CheckTelegramRegistration(ctx context.Context, serverAPIHost, serverID stri logTelegramRegistrationDebug(logger, "Telegram registration: missing serverID (empty string)") return TelegramRegistrationStatus{ Code: 0, - Message: "SERVER_ID non disponibile", + Message: "SERVER_ID not available", Error: fmt.Errorf("server ID missing"), } } diff --git a/internal/orchestrator/.backup.lock b/internal/orchestrator/.backup.lock index 9a4c9c2..2a493a8 100644 --- a/internal/orchestrator/.backup.lock +++ b/internal/orchestrator/.backup.lock @@ -1,3 +1,3 @@ -pid=1706596 +pid=2367035 host=pve -time=2026-01-20T23:21:05+01:00 +time=2026-01-21T19:42:55+01:00 diff --git a/internal/orchestrator/backup_sources.go b/internal/orchestrator/backup_sources.go index a4669de..8642ae3 100644 --- a/internal/orchestrator/backup_sources.go +++ b/internal/orchestrator/backup_sources.go @@ -87,6 +87,7 @@ func buildDecryptPathOptions(cfg *config.Config, logger *logging.Logger) (option func discoverRcloneBackups(ctx context.Context, remotePath string, logger *logging.Logger) (candidates []*decryptCandidate, err error) { done := logging.DebugStart(logger, "discover rclone backups", "remote=%s", remotePath) defer func() { done(err) }() + start := time.Now() // Build full remote path - ensure it ends with ":" if it's just a remote name fullPath := strings.TrimSpace(remotePath) if !strings.Contains(fullPath, ":") { @@ -100,11 +101,12 @@ func discoverRcloneBackups(ctx context.Context, remotePath string, logger *loggi // Use rclone lsf to list files inside the backup directory cmd := exec.CommandContext(ctx, "rclone", "lsf", fullPath) + lsfStart := time.Now() output, err := cmd.CombinedOutput() if err != nil { return nil, fmt.Errorf("failed to list rclone remote %s: %w (output: %s)", fullPath, err, string(output)) } - logging.DebugStep(logger, "discover rclone backups", "rclone lsf output bytes=%d", len(output)) + logging.DebugStep(logger, "discover rclone backups", "rclone lsf output bytes=%d elapsed=%s", len(output), 
time.Since(lsfStart)) candidates = make([]*decryptCandidate, 0) lines := strings.Split(string(output), "\n") @@ -227,12 +229,13 @@ func discoverRcloneBackups(ctx context.Context, remotePath string, logger *loggi logging.DebugStep( logger, "discover rclone backups", - "summary entries=%d empty=%d non_candidate=%d manifest_errors=%d accepted=%d", + "summary entries=%d empty=%d non_candidate=%d manifest_errors=%d accepted=%d elapsed=%s", totalEntries, emptyEntries, nonCandidateEntries, manifestErrors, len(candidates), + time.Since(start), ) logDebug(logger, "Cloud (rclone): scanned %d entries, found %d valid backup candidate(s)", len(lines), len(candidates)) logDebug(logger, "Cloud (rclone): discovered %d bundle candidate(s) in %s", len(candidates), fullPath) diff --git a/internal/orchestrator/decrypt.go b/internal/orchestrator/decrypt.go index a1614da..fa4fbdb 100644 --- a/internal/orchestrator/decrypt.go +++ b/internal/orchestrator/decrypt.go @@ -355,19 +355,32 @@ func inspectBundleManifest(bundlePath string) (*backup.Manifest, error) { func inspectRcloneBundleManifest(ctx context.Context, remotePath string, logger *logging.Logger) (manifest *backup.Manifest, err error) { done := logging.DebugStart(logger, "inspect rclone bundle manifest", "remote=%s", remotePath) defer func() { done(err) }() + start := time.Now() + + // Use a child context so we can stop rclone once the manifest is found. + // This avoids a deadlock when the manifest is near the beginning of the tar: + // if we stop reading stdout early and still Wait(), rclone can block writing + // the remaining bytes into a full pipe. + cmdCtx, cancel := context.WithCancel(ctx) + defer cancel() + logging.DebugStep(logger, "inspect rclone bundle manifest", "executing: rclone cat %s", remotePath) - cmd := exec.CommandContext(ctx, "rclone", "cat", remotePath) + cmd := exec.CommandContext(cmdCtx, "rclone", "cat", remotePath) stdout, err := cmd.StdoutPipe() if err != nil { return nil, fmt.Errorf("open rclone stream: %w", err) } - cmd.Stderr = os.Stderr + defer stdout.Close() + + var stderr bytes.Buffer + cmd.Stderr = &stderr if err := cmd.Start(); err != nil { return nil, fmt.Errorf("start rclone cat: %w", err) } tr := tar.NewReader(stdout) + cancelledEarly := false for { hdr, err := tr.Next() @@ -394,6 +407,8 @@ func inspectRcloneBundleManifest(ctx context.Context, remotePath string, logger } manifest = &m logging.DebugStep(logger, "inspect rclone bundle manifest", "manifest entry=%s bytes=%d", hdr.Name, len(data)) + cancelledEarly = true + cancel() break } } @@ -402,13 +417,25 @@ func inspectRcloneBundleManifest(ctx context.Context, remotePath string, logger // because stopping early may cause rclone to see a broken pipe. 
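+	// Wait() must still run even after cancel(): it reaps the rclone child
+	// process and releases the pipe. When the read loop cancelled on purpose,
+	// a non-zero exit (killed / broken pipe) is expected and is downgraded to
+	// a debug log below via the cancelledEarly flag.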
waitErr := cmd.Wait() if manifest == nil { + stderrMsg := strings.TrimSpace(stderr.String()) if waitErr != nil { + if stderrMsg != "" { + return nil, fmt.Errorf("manifest not found inside remote bundle (rclone exited with error): %w (stderr: %s)", waitErr, stderrMsg) + } return nil, fmt.Errorf("manifest not found inside remote bundle (rclone exited with error): %w", waitErr) } + if stderrMsg != "" { + return nil, fmt.Errorf("manifest not found inside remote bundle %s (stderr: %s)", filepath.Base(remotePath), stderrMsg) + } return nil, fmt.Errorf("manifest not found inside remote bundle %s", filepath.Base(remotePath)) } if waitErr != nil { - logger.Debug("rclone cat %s completed with non-zero status after manifest read: %v", remotePath, waitErr) + // If we cancelled early, rclone is expected to exit non-zero (broken pipe / killed). + if cancelledEarly { + logDebug(logger, "rclone cat %s stopped early after manifest read: %v (elapsed=%s stderr=%q)", remotePath, waitErr, time.Since(start), strings.TrimSpace(stderr.String())) + } else { + logDebug(logger, "rclone cat %s completed with non-zero status after manifest read: %v (elapsed=%s stderr=%q)", remotePath, waitErr, time.Since(start), strings.TrimSpace(stderr.String())) + } } return manifest, nil diff --git a/internal/orchestrator/decrypt_tui.go b/internal/orchestrator/decrypt_tui.go index ef064f2..a45af1a 100644 --- a/internal/orchestrator/decrypt_tui.go +++ b/internal/orchestrator/decrypt_tui.go @@ -7,6 +7,8 @@ import ( "os" "path/filepath" "strings" + "sync/atomic" + "time" "filippo.io/age" "github.com/gdamore/tcell/v2" @@ -154,6 +156,7 @@ func runDecryptSelectionWizard(ctx context.Context, cfg *config.Config, logger * selection = &decryptSelection{} var selectionErr error var scan scanController + var scanSeq uint64 pathList := tview.NewList().ShowSecondaryText(false) pathList.SetMainTextColor(tcell.ColorWhite). 
@@ -171,6 +174,7 @@ func runDecryptSelectionWizard(ctx context.Context, cfg *config.Config, logger * return } selectedOption := options[index] + scanID := atomic.AddUint64(&scanSeq, 1) logging.DebugStep(logger, "decrypt selection wizard", "selected source label=%q path=%q rclone=%v", selectedOption.Label, selectedOption.Path, selectedOption.IsRclone) pages.SwitchToPage("loading") go func() { @@ -179,11 +183,18 @@ func runDecryptSelectionWizard(ctx context.Context, cfg *config.Config, logger * var candidates []*decryptCandidate var scanErr error - scanDone := logging.DebugStart(logger, "scan backup source", "path=%s rclone=%v", selectedOption.Path, selectedOption.IsRclone) + scanDone := logging.DebugStart(logger, "scan backup source", "id=%d path=%s rclone=%v", scanID, selectedOption.Path, selectedOption.IsRclone) defer func() { scanDone(scanErr) }() if selectedOption.IsRclone { - candidates, scanErr = discoverRcloneBackups(scanCtx, selectedOption.Path, logger) + timeout := 30 * time.Second + if cfg != nil && cfg.RcloneTimeoutConnection > 0 { + timeout = time.Duration(cfg.RcloneTimeoutConnection) * time.Second + } + logging.DebugStep(logger, "scan backup source", "id=%d rclone_timeout=%s", scanID, timeout) + rcloneCtx, cancel := context.WithTimeout(scanCtx, timeout) + defer cancel() + candidates, scanErr = discoverRcloneBackups(rcloneCtx, selectedOption.Path, logger) } else { candidates, scanErr = discoverBackupCandidates(logger, selectedOption.Path) } @@ -195,6 +206,9 @@ func runDecryptSelectionWizard(ctx context.Context, cfg *config.Config, logger * app.QueueUpdateDraw(func() { if scanErr != nil { message := fmt.Sprintf("Failed to inspect %s: %v", selectedOption.Path, scanErr) + if selectedOption.IsRclone && errors.Is(scanErr, context.DeadlineExceeded) { + message = fmt.Sprintf("Timed out while scanning %s (rclone). Check connectivity/rclone config or increase RCLONE_TIMEOUT_CONNECTION. (%v)", selectedOption.Path, scanErr) + } showErrorModal(app, pages, configPath, buildSig, message, func() { pages.SwitchToPage("paths") }) diff --git a/internal/orchestrator/directory_recreation.go b/internal/orchestrator/directory_recreation.go index 12f4b53..19dc9f3 100644 --- a/internal/orchestrator/directory_recreation.go +++ b/internal/orchestrator/directory_recreation.go @@ -777,6 +777,13 @@ func isLikelyZFSMountPoint(path string, logger *logging.Logger) bool { // setDatastoreOwnership sets ownership to backup:backup for PBS datastores func setDatastoreOwnership(path string, logger *logging.Logger) error { + if os.Geteuid() != 0 { + // Ownership/permission adjustments are best-effort and should not block + // directory recreation when running without privileges (common in CI/tests). + logger.Debug("PBS datastore ownership: running as non-root (euid=%d); skipping chown/chmod for %s", os.Geteuid(), path) + return nil + } + backupUser, err := user.Lookup("backup") if err != nil { // On non-PBS systems the user may not exist; treat as non-fatal. 
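(For reference: the setDatastoreOwnership hunks around this point all apply one policy — ownership and permission fixes are best-effort. A condensed sketch of that policy, assuming nothing beyond the standard library; bestEffortChown and its parameters are illustrative, while the real code resolves the backup user first and logs through the orchestrator's logger.)

package sketch

import (
	"errors"
	"fmt"
	"os"
	"syscall"
)

// bestEffortChown skips entirely when not root, swallows "expected"
// failures (EPERM/EACCES/EROFS), and only surfaces genuinely
// unexpected errors to the caller.
func bestEffortChown(path string, uid, gid int) error {
	if os.Geteuid() != 0 {
		return nil // non-root (CI, tests): the call could never succeed
	}
	if err := os.Chown(path, uid, gid); err != nil {
		// Not permitted (user namespaces, ACLs) or read-only filesystem:
		// warn-and-continue territory rather than a restore-stopping error.
		if errors.Is(err, syscall.EPERM) || errors.Is(err, syscall.EACCES) || errors.Is(err, syscall.EROFS) {
			return nil
		}
		return fmt.Errorf("chown %s: %w", path, err)
	}
	return nil
}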
@@ -794,6 +801,10 @@ func setDatastoreOwnership(path string, logger *logging.Logger) error { logger.Debug("PBS datastore ownership: chown %s to backup:backup (uid=%d gid=%d)", path, uid, gid) if err := os.Chown(path, uid, gid); err != nil { + if isIgnorableOwnershipError(err) { + logger.Warning("PBS datastore ownership: unable to chown %s to backup:backup (uid=%d gid=%d): %v (continuing)", path, uid, gid, err) + return nil + } return fmt.Errorf("chown %s: %w", path, err) } @@ -809,6 +820,10 @@ func setDatastoreOwnership(path string, logger *logging.Logger) error { if desired != current { logger.Debug("PBS datastore permissions: chmod %s from %o to %o", path, current, desired) if err := os.Chmod(path, desired); err != nil { + if isIgnorableOwnershipError(err) { + logger.Warning("PBS datastore permissions: unable to chmod %s from %o to %o: %v (continuing)", path, current, desired, err) + return nil + } return fmt.Errorf("chmod %s: %w", path, err) } } @@ -817,6 +832,13 @@ func setDatastoreOwnership(path string, logger *logging.Logger) error { return nil } +func isIgnorableOwnershipError(err error) bool { + // Common "can't chown/chmod here" situations: + // - EPERM/EACCES: not permitted (non-root, user namespace restrictions, etc.) + // - EROFS: read-only filesystem + return errors.Is(err, syscall.EPERM) || errors.Is(err, syscall.EACCES) || errors.Is(err, syscall.EROFS) +} + // RecreateDirectoriesFromConfig recreates storage/datastore directories based on system type func RecreateDirectoriesFromConfig(systemType SystemType, logger *logging.Logger) error { logger.Info("Recreating directory structures from configuration...") diff --git a/internal/orchestrator/directory_recreation_test.go b/internal/orchestrator/directory_recreation_test.go index 198b15a..7b8a827 100644 --- a/internal/orchestrator/directory_recreation_test.go +++ b/internal/orchestrator/directory_recreation_test.go @@ -443,14 +443,14 @@ func TestIsLikelyZFSMountPointNoMatch(t *testing.T) { } } -// Test: isLikelyZFSMountPoint con path contenente "datastore" +// Test: isLikelyZFSMountPoint with a path containing "datastore" func TestIsLikelyZFSMountPointDatastorePath(t *testing.T) { logger := newDirTestLogger() cachePath, restore := overridePath(t, &zpoolCachePath, "zpool.cache") defer restore() writeFile(t, cachePath, "cache") - // Path con "datastore" nel nome dovrebbe matchare + // A path with "datastore" in the name should match if !isLikelyZFSMountPoint("/var/lib/datastore", logger) { t.Fatalf("expected true for path containing 'datastore'") } @@ -459,10 +459,10 @@ } } -// Test: createPVEStorageStructure ritorna errore se base directory non creabile +// Test: createPVEStorageStructure returns an error if the base directory can't be created func TestCreatePVEStorageStructureBaseError(t *testing.T) { logger := newDirTestLogger() - // Path con carattere nullo non è valido + // A path under /dev/null is invalid (/dev/null is not a directory, so creation must fail) invalidPath := "/dev/null/cannot/create/here" err := createPVEStorageStructure(invalidPath, "dir", logger) if err == nil { @@ -470,10 +470,10 @@ } } -// Test: createPBSDatastoreStructure ritorna errore se base directory non creabile +// Test: createPBSDatastoreStructure returns an error if the base directory can't be created func TestCreatePBSDatastoreStructureBaseError(t *testing.T) { logger := newDirTestLogger() - // Override zpoolCachePath per evitare ZFS detection + // Override zpoolCachePath to
avoid ZFS detection _, cacheRestore := overridePath(t, &zpoolCachePath, "zpool.cache") defer cacheRestore() @@ -484,10 +484,10 @@ func TestCreatePBSDatastoreStructureBaseError(t *testing.T) { } } -// Test: RecreateDirectoriesFromConfig propaga errore da RecreateStorageDirectories +// Test: RecreateDirectoriesFromConfig propagates an error from RecreateStorageDirectories func TestRecreateDirectoriesFromConfigPVEStatError(t *testing.T) { logger := newDirTestLogger() - // Creiamo una directory e la rendiamo non leggibile per causare errore stat + // Create a directory and make it unreadable to trigger a stat error tmpDir := t.TempDir() cfgDir := filepath.Join(tmpDir, "noperm") if err := os.MkdirAll(cfgDir, 0o000); err != nil { @@ -501,7 +501,7 @@ func TestRecreateDirectoriesFromConfigPVEStatError(t *testing.T) { defer func() { storageCfgPath = prev }() err := RecreateDirectoriesFromConfig(SystemTypePVE, logger) - // Se siamo root, il test non funziona come previsto + // If we're root, the test won't behave as expected if os.Getuid() == 0 { t.Skip("test requires non-root user") } @@ -510,10 +510,10 @@ func TestRecreateDirectoriesFromConfigPVEStatError(t *testing.T) { } } -// Test: RecreateDirectoriesFromConfig propaga errore da RecreateDatastoreDirectories +// Test: RecreateDirectoriesFromConfig propagates an error from RecreateDatastoreDirectories func TestRecreateDirectoriesFromConfigPBSStatError(t *testing.T) { logger := newDirTestLogger() - // Creiamo una directory e la rendiamo non leggibile + // Create a directory and make it unreadable tmpDir := t.TempDir() cfgDir := filepath.Join(tmpDir, "noperm") if err := os.MkdirAll(cfgDir, 0o000); err != nil { @@ -527,7 +527,7 @@ func TestRecreateDirectoriesFromConfigPBSStatError(t *testing.T) { defer func() { datastoreCfgPath = prev }() err := RecreateDirectoriesFromConfig(SystemTypePBS, logger) - // Se siamo root, il test non funziona come previsto + // If we're root, the test won't behave as expected if os.Getuid() == 0 { t.Skip("test requires non-root user") } diff --git a/internal/orchestrator/extensions.go b/internal/orchestrator/extensions.go index 41a12a3..0b2253c 100644 --- a/internal/orchestrator/extensions.go +++ b/internal/orchestrator/extensions.go @@ -175,9 +175,9 @@ func (o *Orchestrator) DispatchEarlyErrorNotification(ctx context.Context, early return stats } -// dispatchNotificationsAndLogs esegue la fase di notifiche e gestione file di log. -// Viene usata sia nel percorso di successo che in quello di errore, così che -// le notifiche vengano comunque inviate e il log venga sempre chiuso/ruotato. +// dispatchNotificationsAndLogs runs the notification phase and log-file management. +// It is used both on success and on error paths, so notifications still go out +// and the log is always closed/rotated. 
func (o *Orchestrator) dispatchNotificationsAndLogs(ctx context.Context, stats *BackupStats) { if o == nil || o.logger == nil { return diff --git a/internal/orchestrator/network_apply.go b/internal/orchestrator/network_apply.go index e9c073a..22de576 100644 --- a/internal/orchestrator/network_apply.go +++ b/internal/orchestrator/network_apply.go @@ -5,6 +5,7 @@ import ( "context" "errors" "fmt" + "net" "os" "os/exec" "path/filepath" @@ -20,8 +21,12 @@ const defaultNetworkRollbackTimeout = 180 * time.Second var ErrNetworkApplyNotCommitted = errors.New("network configuration not committed") type NetworkApplyNotCommittedError struct { - RollbackLog string - RestoredIP string + RollbackLog string + RollbackMarker string + RestoredIP string + OriginalIP string // IP from backup file (will be restored by rollback) + RollbackArmed bool + RollbackDeadline time.Time // when rollback will execute } func (e *NetworkApplyNotCommittedError) Error() string { @@ -115,11 +120,17 @@ func maybeApplyNetworkConfigCLI(ctx context.Context, reader *bufio.Reader, logge } logging.DebugStep(logger, "network safe apply (cli)", "Prompt: apply network now with rollback timer") - applyNowPrompt := fmt.Sprintf( - "Apply restored network configuration now with automatic rollback (%ds)? (y/N): ", - int(defaultNetworkRollbackTimeout.Seconds()), - ) - applyNow, err := promptYesNo(ctx, reader, applyNowPrompt) + rollbackSeconds := int(defaultNetworkRollbackTimeout.Seconds()) + fmt.Println() + fmt.Println("Network restore: a restored network configuration is ready to apply.") + if strings.TrimSpace(stageRoot) != "" { + fmt.Printf("Source: %s (will be copied to /etc and applied)\n", strings.TrimSpace(stageRoot)) + } + fmt.Println("This will reload networking immediately (no reboot).") + fmt.Println("WARNING: This may change the active IP and disconnect SSH/Web sessions.") + fmt.Printf("After applying, type COMMIT within %ds or ProxSave will roll back automatically.\n", rollbackSeconds) + fmt.Println("Recommendation: run this step from the local console/IPMI, not over SSH.") + applyNow, err := promptYesNoWithCountdown(ctx, reader, logger, "Apply network configuration now?", 90*time.Second, false) if err != nil { return err } @@ -418,42 +429,166 @@ func applyNetworkWithRollbackCLI(ctx context.Context, reader *bufio.Reader, logg logging.DebugStep(logger, "network safe apply (cli)", "Wait for COMMIT (rollback in %ds)", int(remaining.Seconds())) committed, err := promptNetworkCommitWithCountdown(ctx, reader, logger, remaining) if err != nil { - logger.Warning("Commit prompt error: %v", err) + logger.Warning("Commit input lost (%v); rollback remains ARMED and will proceed automatically.", err) + return buildNetworkApplyNotCommittedError(ctx, logger, iface, handle) } logging.DebugStep(logger, "network safe apply (cli)", "User commit result: committed=%v", committed) if committed { + if rollbackAlreadyRunning(ctx, logger, handle) { + logger.Warning("Commit received too late: rollback already running. Network configuration NOT committed.") + return buildNetworkApplyNotCommittedError(ctx, logger, iface, handle) + } disarmNetworkRollback(ctx, logger, handle) logger.Info("Network configuration committed successfully.") return nil } - // Timer window expired: run rollback now so the restore summary can report the final state. 
- if output, rbErr := restoreCmd.Run(ctx, "sh", handle.scriptPath); rbErr != nil { - if len(output) > 0 { - logger.Debug("Rollback script output: %s", strings.TrimSpace(string(output))) + // Not committed: keep rollback ARMED. Do not disarm. + // The rollback script will run via systemd-run/nohup when the timer expires. + return buildNetworkApplyNotCommittedError(ctx, logger, iface, handle) +} + +// extractIPFromSnapshot reads the IP address for a given interface from a network snapshot report file. +// It searches the output section that follows the "$ ip -br addr" command written by writeNetworkSnapshot. +func extractIPFromSnapshot(path, iface string) string { + if strings.TrimSpace(path) == "" || strings.TrimSpace(iface) == "" { + return "unknown" + } + data, err := restoreFS.ReadFile(path) + if err != nil { + return "unknown" + } + + lines := strings.Split(string(data), "\n") + inAddrSection := false + for _, line := range lines { + line = strings.TrimSpace(line) + if line == "$ ip -br addr" { + inAddrSection = true + continue } - return fmt.Errorf("network apply not committed; rollback failed (log: %s): %w", strings.TrimSpace(handle.logPath), rbErr) - } else if len(output) > 0 { - logger.Debug("Rollback script output: %s", strings.TrimSpace(string(output))) + if strings.HasPrefix(line, "$ ") { + if inAddrSection { + break + } + continue + } + if !inAddrSection || line == "" || strings.HasPrefix(line, "ERROR:") { + continue + } + + fields := strings.Fields(line) + if len(fields) < 3 || fields[0] != iface { + continue + } + + // "ip -br addr" can print multiple addresses; prefer IPv4 when available. + firstIPv6 := "" + for _, token := range fields[2:] { + ip := strings.Split(token, "/")[0] + parsed := net.ParseIP(ip) + if parsed == nil { + continue + } + if parsed.To4() != nil { + return ip + } + if firstIPv6 == "" { + firstIPv6 = ip + } + } + if firstIPv6 != "" { + return firstIPv6 + } + return "unknown" } - disarmNetworkRollback(ctx, logger, handle) + + return "unknown" +} + +func buildNetworkApplyNotCommittedError(ctx context.Context, logger *logging.Logger, iface string, handle *networkRollbackHandle) *NetworkApplyNotCommittedError { + logging.DebugStep(logger, "build not-committed error", "Start: iface=%s handle=%v", iface, handle != nil) restoredIP := "unknown" if strings.TrimSpace(iface) != "" { - deadline := time.Now().Add(5 * time.Second) - for time.Now().Before(deadline) { - ep, err := currentNetworkEndpoint(ctx, iface, 1*time.Second) - if err == nil && len(ep.Addresses) > 0 { - restoredIP = strings.Join(ep.Addresses, ", ") - break - } - time.Sleep(300 * time.Millisecond) + logging.DebugStep(logger, "build not-committed error", "Querying current IP for iface=%s", iface) + if ep, err := currentNetworkEndpoint(ctx, iface, 1*time.Second); err == nil && len(ep.Addresses) > 0 { + restoredIP = strings.Join(ep.Addresses, ", ") + logging.DebugStep(logger, "build not-committed error", "Current IP: %s", restoredIP) + } else if err != nil { + logging.DebugStep(logger, "build not-committed error", "Failed to get IP: %v", err) + } + } + + rollbackArmed := true + logging.DebugStep(logger, "build not-committed error", "Checking rollback marker status") + if handle == nil { + rollbackArmed = false + logging.DebugStep(logger, "build not-committed error", "No handle: rollbackArmed=false") + } else if strings.TrimSpace(handle.markerPath) != "" { + if _, statErr := restoreFS.Stat(handle.markerPath); statErr != nil { + // Marker missing => rollback likely already executed (or was manually 
removed). + rollbackArmed = false + logging.DebugStep(logger, "build not-committed error", "Marker missing (%s): rollbackArmed=false", handle.markerPath) + } else { + logging.DebugStep(logger, "build not-committed error", "Marker exists (%s): rollbackArmed=true", handle.markerPath) + } + } + + rollbackLog := "" + rollbackMarker := "" + originalIP := "unknown" + var rollbackDeadline time.Time + if handle != nil { + rollbackLog = strings.TrimSpace(handle.logPath) + rollbackMarker = strings.TrimSpace(handle.markerPath) + // Read original IP from before.txt snapshot (IP that will be restored by rollback) + if strings.TrimSpace(handle.workDir) != "" { + beforePath := filepath.Join(handle.workDir, "before.txt") + originalIP = extractIPFromSnapshot(beforePath, iface) + logging.DebugStep(logger, "build not-committed error", "Original IP from %s: %s", beforePath, originalIP) } + // Calculate rollback deadline + rollbackDeadline = handle.armedAt.Add(handle.timeout) + logging.DebugStep(logger, "build not-committed error", "Rollback deadline: %s", rollbackDeadline.Format(time.RFC3339)) } + + logging.DebugStep(logger, "build not-committed error", "Result: ip=%s originalIP=%s armed=%v log=%s", restoredIP, originalIP, rollbackArmed, rollbackLog) return &NetworkApplyNotCommittedError{ - RollbackLog: strings.TrimSpace(handle.logPath), - RestoredIP: strings.TrimSpace(restoredIP), + RollbackLog: rollbackLog, + RollbackMarker: rollbackMarker, + RestoredIP: strings.TrimSpace(restoredIP), + OriginalIP: originalIP, + RollbackArmed: rollbackArmed, + RollbackDeadline: rollbackDeadline, + } +} + +func rollbackAlreadyRunning(ctx context.Context, logger *logging.Logger, handle *networkRollbackHandle) bool { + if handle == nil || strings.TrimSpace(handle.unitName) == "" { + logging.DebugStep(logger, "rollback already running", "Skip check: handle=%v unitName=%q", handle != nil, "") + return false + } + if !commandAvailable("systemctl") { + logging.DebugStep(logger, "rollback already running", "Skip check: systemctl not available") + return false + } + + serviceUnit := strings.TrimSpace(handle.unitName) + ".service" + logging.DebugStep(logger, "rollback already running", "Checking systemctl is-active %s", serviceUnit) + + checkCtx, cancel := context.WithTimeout(ctx, 2*time.Second) + defer cancel() + out, err := restoreCmd.Run(checkCtx, "systemctl", "is-active", serviceUnit) + if err != nil { + logging.DebugStep(logger, "rollback already running", "systemctl is-active failed: %v (assuming not running)", err) + return false } + + state := strings.TrimSpace(string(out)) + running := state == "active" || state == "activating" + logging.DebugStep(logger, "rollback already running", "Service state=%s running=%v", state, running) + return running } func armNetworkRollback(ctx context.Context, logger *logging.Logger, backupPath string, timeout time.Duration, workDir string) (handle *networkRollbackHandle, err error) { @@ -466,6 +601,7 @@ func armNetworkRollback(ctx context.Context, logger *logging.Logger, backupPath if timeout <= 0 { return nil, fmt.Errorf("invalid rollback timeout") } + logging.DebugStep(logger, "arm network rollback", "Parameters validated: backup=%s timeout=%s", backupPath, timeout) logging.DebugStep(logger, "arm network rollback", "Prepare rollback work directory") baseDir := strings.TrimSpace(workDir) @@ -487,17 +623,20 @@ func armNetworkRollback(ctx context.Context, logger *logging.Logger, backupPath armedAt: time.Now(), timeout: timeout, } + logging.DebugStep(logger, "arm network rollback", "Handle 
created: marker=%s script=%s log=%s", handle.markerPath, handle.scriptPath, handle.logPath) logging.DebugStep(logger, "arm network rollback", "Write rollback marker: %s", handle.markerPath) if err := restoreFS.WriteFile(handle.markerPath, []byte("pending\n"), 0o640); err != nil { return nil, fmt.Errorf("write rollback marker: %w", err) } + logging.DebugStep(logger, "arm network rollback", "Marker written successfully") logging.DebugStep(logger, "arm network rollback", "Write rollback script: %s", handle.scriptPath) script := buildRollbackScript(handle.markerPath, backupPath, handle.logPath, true) if err := restoreFS.WriteFile(handle.scriptPath, []byte(script), 0o640); err != nil { return nil, fmt.Errorf("write rollback script: %w", err) } + logging.DebugStep(logger, "arm network rollback", "Script written successfully (%d bytes)", len(script)) timeoutSeconds := int(timeout.Seconds()) if timeoutSeconds < 1 { @@ -517,8 +656,11 @@ func armNetworkRollback(ctx context.Context, logger *logging.Logger, backupPath logger.Warning("systemd-run failed, falling back to background timer: %v", err) logger.Debug("systemd-run output: %s", strings.TrimSpace(string(output))) handle.unitName = "" - } else if len(output) > 0 { - logger.Debug("systemd-run output: %s", strings.TrimSpace(string(output))) + } else { + logging.DebugStep(logger, "arm network rollback", "Timer armed via systemd-run: unit=%s", handle.unitName) + if len(output) > 0 { + logger.Debug("systemd-run output: %s", strings.TrimSpace(string(output))) + } } } @@ -529,6 +671,7 @@ func armNetworkRollback(ctx context.Context, logger *logging.Logger, backupPath logger.Debug("Background rollback output: %s", strings.TrimSpace(string(output))) return nil, fmt.Errorf("failed to arm rollback timer: %w", err) } + logging.DebugStep(logger, "arm network rollback", "Timer armed via nohup (fallback)") } logger.Info("Rollback timer armed (%ds). Work dir: %s (log: %s)", timeoutSeconds, baseDir, handle.logPath) @@ -537,19 +680,37 @@ func armNetworkRollback(ctx context.Context, logger *logging.Logger, backupPath func disarmNetworkRollback(ctx context.Context, logger *logging.Logger, handle *networkRollbackHandle) { if handle == nil { + logging.DebugStep(logger, "disarm network rollback", "Skip: handle is nil") return } - logging.DebugStep(logger, "disarm network rollback", "Disarming rollback (marker=%s unit=%s)", strings.TrimSpace(handle.markerPath), strings.TrimSpace(handle.unitName)) - if handle.markerPath != "" { + + logging.DebugStep(logger, "disarm network rollback", "Start: marker=%s unit=%s", strings.TrimSpace(handle.markerPath), strings.TrimSpace(handle.unitName)) + + // Remove marker first so that even if the timer triggers concurrently the rollback script exits early. 
+ if strings.TrimSpace(handle.markerPath) != "" { + logging.DebugStep(logger, "disarm network rollback", "Removing marker file: %s", handle.markerPath) if err := restoreFS.Remove(handle.markerPath); err != nil && !errors.Is(err, os.ErrNotExist) { - logger.Debug("Failed to remove rollback marker %s: %v", handle.markerPath, err) + logger.Warning("Failed to remove rollback marker %s: %v", handle.markerPath, err) + } else { + logging.DebugStep(logger, "disarm network rollback", "Marker removed successfully") } } - if handle.unitName != "" && commandAvailable("systemctl") { - if output, err := restoreCmd.Run(ctx, "systemctl", "stop", handle.unitName); err != nil { - logger.Debug("Failed to stop rollback unit %s: %v (output: %s)", handle.unitName, err, strings.TrimSpace(string(output))) + + if strings.TrimSpace(handle.unitName) != "" && commandAvailable("systemctl") { + // Stop the timer only. If the service already started, let it finish. + timerUnit := strings.TrimSpace(handle.unitName) + ".timer" + logging.DebugStep(logger, "disarm network rollback", "Stopping timer: %s", timerUnit) + if output, err := restoreCmd.Run(ctx, "systemctl", "stop", timerUnit); err != nil { + logger.Warning("Failed to stop rollback timer %s: %v (output: %s)", timerUnit, err, strings.TrimSpace(string(output))) + } else { + logging.DebugStep(logger, "disarm network rollback", "Timer stopped successfully") } + + logging.DebugStep(logger, "disarm network rollback", "Resetting failed units") + _, _ = restoreCmd.Run(ctx, "systemctl", "reset-failed", strings.TrimSpace(handle.unitName)+".service", timerUnit) } + + logging.DebugStep(logger, "disarm network rollback", "Disarm complete") } func maybeRepairNICNamesCLI(ctx context.Context, reader *bufio.Reader, logger *logging.Logger, archivePath string) *nicRepairResult { @@ -739,18 +900,24 @@ func defaultNetworkPortChecks(systemType SystemType) []tcpPortCheck { } func promptNetworkCommitWithCountdown(ctx context.Context, reader *bufio.Reader, logger *logging.Logger, remaining time.Duration) (bool, error) { + logging.DebugStep(logger, "prompt commit", "Start: remaining=%s", remaining) + if remaining <= 0 { + logging.DebugStep(logger, "prompt commit", "No time remaining, returning timeout") return false, context.DeadlineExceeded } - fmt.Printf("Type COMMIT within %d seconds to keep the new network configuration.\n", int(remaining.Seconds())) deadline := time.Now().Add(remaining) + logging.DebugStep(logger, "prompt commit", "Deadline set: %s", deadline.Format(time.RFC3339)) + + fmt.Printf("Type COMMIT within %d seconds to keep the new network configuration.\n", int(remaining.Seconds())) ctxTimeout, cancel := context.WithDeadline(ctx, deadline) defer cancel() inputCh := make(chan string, 1) errCh := make(chan error, 1) + logging.DebugStep(logger, "prompt commit", "Starting input reader goroutine") go func() { line, err := input.ReadLineWithContext(ctxTimeout, reader) if err != nil { @@ -763,6 +930,8 @@ func promptNetworkCommitWithCountdown(ctx context.Context, reader *bufio.Reader, ticker := time.NewTicker(1 * time.Second) defer ticker.Stop() + logging.DebugStep(logger, "prompt commit", "Waiting for user input...") + for { select { case <-ticker.C: @@ -773,20 +942,27 @@ func promptNetworkCommitWithCountdown(ctx context.Context, reader *bufio.Reader, fmt.Fprintf(os.Stderr, "\rRollback in %ds... 
Type COMMIT to keep: ", int(left.Seconds())) if left <= 0 { fmt.Fprintln(os.Stderr) + logging.DebugStep(logger, "prompt commit", "Timeout expired, returning DeadlineExceeded") return false, context.DeadlineExceeded } case line := <-inputCh: fmt.Fprintln(os.Stderr) - if strings.EqualFold(strings.TrimSpace(line), "commit") { + trimmedLine := strings.TrimSpace(line) + logging.DebugStep(logger, "prompt commit", "User input received: %q", trimmedLine) + if strings.EqualFold(trimmedLine, "commit") { + logging.DebugStep(logger, "prompt commit", "Result: COMMITTED") return true, nil } + logging.DebugStep(logger, "prompt commit", "Result: NOT COMMITTED (input was not 'commit')") return false, nil case err := <-errCh: fmt.Fprintln(os.Stderr) + logging.DebugStep(logger, "prompt commit", "Input error received: %v", err) if errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) { + logging.DebugStep(logger, "prompt commit", "Result: context deadline/canceled") return false, err } - logger.Debug("Commit input error: %v", err) + logging.DebugStep(logger, "prompt commit", "Result: NOT COMMITTED (input error)") return false, err } } @@ -851,41 +1027,70 @@ func buildRollbackScript(markerPath, backupPath, logPath string, restartNetworki fmt.Sprintf("LOG=%s", shellQuote(logPath)), fmt.Sprintf("MARKER=%s", shellQuote(markerPath)), fmt.Sprintf("BACKUP=%s", shellQuote(backupPath)), - `if [ ! -f "$MARKER" ]; then exit 0; fi`, - `echo "Rollback started at $(date -Is)" >> "$LOG"`, - `echo "Rollback backup: $BACKUP" >> "$LOG"`, - `echo "Extract phase: restore files from rollback archive" >> "$LOG"`, + // Header + `echo "[INFO] ========================================" >> "$LOG"`, + `echo "[INFO] NETWORK ROLLBACK SCRIPT STARTED" >> "$LOG"`, + `echo "[INFO] Timestamp: $(date -Is)" >> "$LOG"`, + `echo "[INFO] Marker: $MARKER" >> "$LOG"`, + `echo "[INFO] Backup: $BACKUP" >> "$LOG"`, + `echo "[INFO] ========================================" >> "$LOG"`, + // Check marker + `echo "[DEBUG] Checking marker file..." >> "$LOG"`, + `if [ ! 
-f "$MARKER" ]; then`, + ` echo "[INFO] Marker not found - rollback cancelled (already disarmed)" >> "$LOG"`, + ` echo "[INFO] ========================================" >> "$LOG"`, + ` echo "[INFO] ROLLBACK SCRIPT EXITED (no-op)" >> "$LOG"`, + ` echo "[INFO] ========================================" >> "$LOG"`, + ` exit 0`, + `fi`, + `echo "[DEBUG] Marker exists, proceeding with rollback" >> "$LOG"`, + // Extract phase + `echo "[INFO] --- EXTRACT PHASE ---" >> "$LOG"`, + `echo "[DEBUG] Executing: tar -xzf $BACKUP -C /" >> "$LOG"`, `TAR_OK=0`, - `if tar -xzf "$BACKUP" -C / >> "$LOG" 2>&1; then TAR_OK=1; echo "Extract phase: OK" >> "$LOG"; else echo "WARN: failed to extract rollback archive; skipping prune phase" >> "$LOG"; fi`, + `if tar -xzf "$BACKUP" -C / >> "$LOG" 2>&1; then`, + ` TAR_OK=1`, + ` echo "[OK] Extract phase completed successfully" >> "$LOG"`, + `else`, + ` RC=$?`, + ` echo "[ERROR] Extract phase failed (exit=$RC) - skipping prune phase" >> "$LOG"`, + `fi`, + // Prune phase `if [ "$TAR_OK" -eq 1 ] && [ -d /etc/network ]; then`, - ` echo "Prune phase: removing files created after backup (network-only)" >> "$LOG"`, - ` echo "Prune scope: /etc/network (+ /etc/cloud/cloud.cfg.d/99-disable-network-config.cfg, /etc/dnsmasq.d/lxc-vmbr1.conf)" >> "$LOG"`, + ` echo "[INFO] --- PRUNE PHASE ---" >> "$LOG"`, + ` echo "[DEBUG] Scope: /etc/network (+ /etc/cloud/cloud.cfg.d/99-disable-network-config.cfg, /etc/dnsmasq.d/lxc-vmbr1.conf)" >> "$LOG"`, ` (`, ` set +e`, + ` echo "[DEBUG] Creating temp files for prune operation..."`, ` MANIFEST_ALL=$(mktemp /tmp/proxsave/network_rollback_manifest_all_XXXXXX 2>/dev/null)`, ` MANIFEST=$(mktemp /tmp/proxsave/network_rollback_manifest_XXXXXX 2>/dev/null)`, ` CANDIDATES=$(mktemp /tmp/proxsave/network_rollback_candidates_XXXXXX 2>/dev/null)`, ` CLEANUP=$(mktemp /tmp/proxsave/network_rollback_cleanup_XXXXXX 2>/dev/null)`, ` if [ -z "$MANIFEST_ALL" ] || [ -z "$MANIFEST" ] || [ -z "$CANDIDATES" ] || [ -z "$CLEANUP" ]; then`, - ` echo "WARN: mktemp failed; skipping prune"`, + ` echo "[WARN] mktemp failed - skipping prune phase"`, ` exit 0`, ` fi`, - ` echo "Listing rollback archive contents..."`, + ` echo "[DEBUG] Temp files created: manifest_all=$MANIFEST_ALL manifest=$MANIFEST candidates=$CANDIDATES cleanup=$CLEANUP"`, + ` echo "[DEBUG] Listing rollback archive contents..."`, ` if ! tar -tzf "$BACKUP" > "$MANIFEST_ALL"; then`, - ` echo "WARN: failed to list rollback archive; skipping prune"`, + ` echo "[WARN] Failed to list rollback archive - skipping prune phase"`, ` rm -f "$MANIFEST_ALL" "$MANIFEST" "$CANDIDATES" "$CLEANUP"`, ` exit 0`, ` fi`, - ` echo "Normalizing manifest paths..."`, + ` MANIFEST_COUNT=$(wc -l < "$MANIFEST_ALL")`, + ` echo "[DEBUG] Archive contains $MANIFEST_COUNT entries"`, + ` echo "[DEBUG] Normalizing manifest paths..."`, ` sed 's#^\./##' "$MANIFEST_ALL" > "$MANIFEST"`, ` if ! 
grep -q '^etc/network/' "$MANIFEST"; then`, - ` echo "WARN: rollback archive does not include etc/network; skipping prune"`, + ` echo "[WARN] Rollback archive does not include etc/network - skipping prune phase"`, ` rm -f "$MANIFEST_ALL" "$MANIFEST" "$CANDIDATES" "$CLEANUP"`, ` exit 0`, ` fi`, - ` echo "Scanning current filesystem under /etc/network..."`, + ` echo "[DEBUG] Scanning current filesystem under /etc/network..."`, ` find /etc/network -mindepth 1 \( -type f -o -type l \) -print > "$CANDIDATES" 2>/dev/null || true`, - ` echo "Computing cleanup list (present on disk, absent in backup)..."`, + ` CANDIDATES_COUNT=$(wc -l < "$CANDIDATES")`, + ` echo "[DEBUG] Found $CANDIDATES_COUNT files/links on disk"`, + ` echo "[DEBUG] Computing cleanup list (present on disk, absent in backup)..."`, ` : > "$CLEANUP"`, ` while IFS= read -r path; do`, ` rel=${path#/}`, @@ -902,15 +1107,22 @@ func buildRollbackScript(markerPath, backupPath, logPath string, restartNetworki ` fi`, ` done`, ` if [ -s "$CLEANUP" ]; then`, - ` echo "Pruning extraneous network files (not present in backup):"`, + ` CLEANUP_COUNT=$(wc -l < "$CLEANUP")`, + ` echo "[DEBUG] Found $CLEANUP_COUNT extraneous files to prune:"`, ` cat "$CLEANUP"`, + ` echo "[DEBUG] Removing extraneous files..."`, + ` REMOVED=0`, ` while IFS= read -r rmPath; do`, - ` rm -f -- "$rmPath" || true`, + ` if rm -f -- "$rmPath"; then`, + ` REMOVED=$((REMOVED+1))`, + ` else`, + ` echo "[WARN] Failed to remove: $rmPath"`, + ` fi`, ` done < "$CLEANUP"`, + ` echo "[OK] Prune phase completed - removed $REMOVED files"`, ` else`, - ` echo "No extraneous network files to prune."`, + ` echo "[OK] Prune phase completed - no extraneous files to remove"`, ` fi`, - ` echo "Prune phase: done"`, ` rm -f "$MANIFEST_ALL" "$MANIFEST" "$CANDIDATES" "$CLEANUP"`, ` ) >> "$LOG" 2>&1 || true`, `fi`, @@ -918,19 +1130,59 @@ func buildRollbackScript(markerPath, backupPath, logPath string, restartNetworki if restartNetworking { lines = append(lines, - `echo "Restart networking after rollback" >> "$LOG"`, - `if command -v ifreload >/dev/null 2>&1; then ifreload -a >> "$LOG" 2>&1 || true;`, - `elif command -v systemctl >/dev/null 2>&1; then systemctl restart networking >> "$LOG" 2>&1 || true;`, - `elif command -v ifup >/dev/null 2>&1; then ifup -a >> "$LOG" 2>&1 || true;`, + `echo "[INFO] Restart networking after rollback" >> "$LOG"`, + `echo "[INFO] Live state before reload:" >> "$LOG"`, + `ip -br addr >> "$LOG" 2>&1 || true`, + `ip route show >> "$LOG" 2>&1 || true`, + `RELOAD_OK=0`, + `if command -v ifreload >/dev/null 2>&1; then`, + ` echo "[INFO] Executing: ifreload -a" >> "$LOG"`, + ` if ifreload -a >> "$LOG" 2>&1; then`, + ` RELOAD_OK=1`, + ` echo "[OK] ifreload -a completed successfully" >> "$LOG"`, + ` else`, + ` RC=$?`, + ` echo "[ERROR] ifreload -a failed (exit=$RC)" >> "$LOG"`, + ` fi`, + `fi`, + `if [ "$RELOAD_OK" -eq 0 ] && command -v systemctl >/dev/null 2>&1; then`, + ` echo "[INFO] Executing fallback: systemctl restart networking" >> "$LOG"`, + ` if systemctl restart networking >> "$LOG" 2>&1; then`, + ` RELOAD_OK=1`, + ` echo "[OK] systemctl restart networking completed successfully" >> "$LOG"`, + ` else`, + ` RC=$?`, + ` echo "[ERROR] systemctl restart networking failed (exit=$RC)" >> "$LOG"`, + ` fi`, + `fi`, + `if [ "$RELOAD_OK" -eq 0 ] && command -v ifup >/dev/null 2>&1; then`, + ` echo "[INFO] Executing fallback: ifup -a" >> "$LOG"`, + ` if ifup -a >> "$LOG" 2>&1; then`, + ` RELOAD_OK=1`, + ` echo "[OK] ifup -a completed successfully" >> "$LOG"`, + ` else`, + ` 
RC=$?`, + ` echo "[ERROR] ifup -a failed (exit=$RC)" >> "$LOG"`, + ` fi`, + `fi`, + `if [ "$RELOAD_OK" -eq 0 ]; then`, + ` echo "[WARN] All network reload methods failed - network may not be properly configured" >> "$LOG"`, `fi`, + `echo "[INFO] Live state after reload:" >> "$LOG"`, + `ip -br addr >> "$LOG" 2>&1 || true`, + `ip route show >> "$LOG" 2>&1 || true`, ) } else { - lines = append(lines, `echo "Restart networking after rollback: skipped (manual)" >> "$LOG"`) + lines = append(lines, `echo "[INFO] Restart networking after rollback: skipped (manual)" >> "$LOG"`) } lines = append(lines, + `echo "[DEBUG] Removing marker file..." >> "$LOG"`, `rm -f "$MARKER"`, - `echo "Rollback finished at $(date -Is)" >> "$LOG"`, + `echo "[INFO] ========================================" >> "$LOG"`, + `echo "[INFO] NETWORK ROLLBACK SCRIPT FINISHED" >> "$LOG"`, + `echo "[INFO] Timestamp: $(date -Is)" >> "$LOG"`, + `echo "[INFO] ========================================" >> "$LOG"`, ) return strings.Join(lines, "\n") + "\n" } diff --git a/internal/orchestrator/network_diagnostics.go b/internal/orchestrator/network_diagnostics.go index 42d2a5e..1509f17 100644 --- a/internal/orchestrator/network_diagnostics.go +++ b/internal/orchestrator/network_diagnostics.go @@ -1,9 +1,14 @@ package orchestrator import ( + "bufio" "context" + "crypto/sha256" + "encoding/hex" "fmt" + "os" "path/filepath" + "sort" "strings" "sync/atomic" "time" @@ -13,6 +18,8 @@ import ( var networkDiagnosticsSequence uint64 +const snapshotLogMaxExtractedLines = 20 + func createNetworkDiagnosticsDir() (string, error) { baseDir := "/tmp/proxsave" if err := restoreFS.MkdirAll(baseDir, 0o755); err != nil { @@ -45,6 +52,7 @@ func writeNetworkSnapshot(ctx context.Context, logger *logging.Logger, diagnosti b.WriteString(fmt.Sprintf("GeneratedAt: %s\n", nowRestore().Format(time.RFC3339))) b.WriteString(fmt.Sprintf("Label: %s\n\n", label)) + b.WriteString("=== LIVE NETWORK STATE ===\n\n") commands := [][]string{ {"ip", "-br", "link"}, {"ip", "-br", "addr"}, @@ -60,6 +68,9 @@ func writeNetworkSnapshot(ctx context.Context, logger *logging.Logger, diagnosti ctxTimeout, cancel := context.WithTimeout(ctx, timeout) out, err := restoreCmd.Run(ctxTimeout, cmd[0], cmd[1:]...) 
cancel() + if logger != nil { + debugLogCommandResult(logger, "network snapshot", label, cmd, out, err) + } if len(out) > 0 { b.Write(out) if out[len(out)-1] != '\n' { @@ -75,6 +86,23 @@ func writeNetworkSnapshot(ctx context.Context, logger *logging.Logger, diagnosti b.WriteString("\n") } + b.WriteString("=== ON-DISK NETWORK CONFIG ===\n\n") + appendFileSnapshot(logger, label, &b, "/etc/network/interfaces", 64*1024) + appendDirSnapshot(logger, label, &b, "/etc/network/interfaces.d", 64*1024) + appendFileSnapshot(logger, label, &b, "/etc/hosts", 64*1024) + appendFileSnapshot(logger, label, &b, "/etc/hostname", 8*1024) + appendFileSnapshot(logger, label, &b, "/etc/resolv.conf", 16*1024) + + b.WriteString("=== NETWORK STACK SERVICES ===\n\n") + appendCommandSnapshot(ctx, logger, &b, timeout, []string{"systemctl", "is-active", "networking"}) + appendCommandSnapshot(ctx, logger, &b, timeout, []string{"systemctl", "is-active", "systemd-networkd"}) + appendCommandSnapshot(ctx, logger, &b, timeout, []string{"systemctl", "is-active", "NetworkManager"}) + b.WriteString("\n") + + b.WriteString("=== ifupdown (ifquery --running -a) ===\n\n") + appendCommandSnapshot(ctx, logger, &b, timeout, []string{"ifquery", "--running", "-a"}) + b.WriteString("\n") + if err := restoreFS.WriteFile(path, []byte(b.String()), 0o600); err != nil { return "", err } @@ -82,6 +110,221 @@ func writeNetworkSnapshot(ctx context.Context, logger *logging.Logger, diagnosti return path, nil } +func appendCommandSnapshot(ctx context.Context, logger *logging.Logger, b *strings.Builder, timeout time.Duration, cmd []string) { + if len(cmd) == 0 { + return + } + b.WriteString("$ " + strings.Join(cmd, " ") + "\n") + ctxTimeout, cancel := context.WithTimeout(ctx, timeout) + out, err := restoreCmd.Run(ctxTimeout, cmd[0], cmd[1:]...) 
+ cancel() + if logger != nil { + debugLogCommandResult(logger, "network snapshot", "command", cmd, out, err) + } + if len(out) > 0 { + b.Write(out) + if out[len(out)-1] != '\n' { + b.WriteString("\n") + } + } + if err != nil { + b.WriteString(fmt.Sprintf("ERROR: %v\n", err)) + if logger != nil { + logger.Debug("Network snapshot command failed: %s: %v", strings.Join(cmd, " "), err) + } + } + b.WriteString("\n") +} + +func appendFileSnapshot(logger *logging.Logger, label string, b *strings.Builder, path string, maxBytes int) { + b.WriteString("## File: " + path + "\n") + if logger != nil { + logging.DebugStep(logger, "network snapshot", "Read file (%s): %s", label, path) + } + info, err := restoreFS.Stat(path) + if err != nil { + b.WriteString(fmt.Sprintf("ERROR: %v\n\n", err)) + if logger != nil { + logging.DebugStep(logger, "network snapshot", "Stat failed (%s): %s: %v", label, path, err) + } + return + } + b.WriteString(fmt.Sprintf("Mode: %s\n", info.Mode().String())) + b.WriteString(fmt.Sprintf("Size: %d\n", info.Size())) + b.WriteString(fmt.Sprintf("ModTime: %s\n\n", info.ModTime().Format(time.RFC3339))) + data, err := restoreFS.ReadFile(path) + if err != nil { + b.WriteString(fmt.Sprintf("ERROR: %v\n\n", err)) + if logger != nil { + logging.DebugStep(logger, "network snapshot", "Read failed (%s): %s: %v", label, path, err) + } + return + } + if logger != nil { + debugLogOnDiskFile(logger, label, path, info, data) + } + if maxBytes > 0 && len(data) > maxBytes { + b.Write(data[:maxBytes]) + if maxBytes > 0 && (len(data) == 0 || data[maxBytes-1] != '\n') { + b.WriteString("\n") + } + b.WriteString(fmt.Sprintf("\n[truncated: %d of %d bytes]\n\n", maxBytes, len(data))) + return + } + b.Write(data) + if len(data) > 0 && data[len(data)-1] != '\n' { + b.WriteString("\n") + } + b.WriteString("\n") +} + +func appendDirSnapshot(logger *logging.Logger, label string, b *strings.Builder, dir string, maxBytesPerFile int) { + b.WriteString("## Dir: " + dir + "\n") + if logger != nil { + logging.DebugStep(logger, "network snapshot", "Read dir (%s): %s", label, dir) + } + entries, err := restoreFS.ReadDir(dir) + if err != nil { + b.WriteString(fmt.Sprintf("ERROR: %v\n\n", err)) + if logger != nil { + logging.DebugStep(logger, "network snapshot", "ReadDir failed (%s): %s: %v", label, dir, err) + } + return + } + if len(entries) == 0 { + b.WriteString("(empty)\n\n") + if logger != nil { + logging.DebugStep(logger, "network snapshot", "Dir empty (%s): %s", label, dir) + } + return + } + + type entryInfo struct { + name string + mode os.FileMode + } + var list []entryInfo + for _, e := range entries { + info, err := e.Info() + if err != nil { + continue + } + list = append(list, entryInfo{name: e.Name(), mode: info.Mode()}) + } + sort.Slice(list, func(i, j int) bool { return list[i].name < list[j].name }) + if logger != nil { + names := make([]string, 0, len(list)) + for _, e := range list { + names = append(names, e.name) + } + logging.DebugStep(logger, "network snapshot", "Dir entries (%s): %s: %s", label, dir, strings.Join(names, ", ")) + } + for _, e := range list { + b.WriteString(fmt.Sprintf("- %s (%s)\n", e.name, e.mode.String())) + } + b.WriteString("\n") + + for _, e := range list { + full := filepath.Join(dir, e.name) + if e.mode.IsRegular() || (e.mode&os.ModeSymlink != 0) { + appendFileSnapshot(logger, label, b, full, maxBytesPerFile) + } + } +} + +func debugLogCommandResult(logger *logging.Logger, operation, label string, cmd []string, out []byte, err error) { + if logger == nil || len(cmd) == 
0 { + return + } + cmdStr := strings.Join(cmd, " ") + if err != nil { + logging.DebugStep(logger, operation, "Command error (%s): %s: %v", label, cmdStr, err) + } else { + logging.DebugStep(logger, operation, "Command ok (%s): %s", label, cmdStr) + } + if len(out) == 0 { + return + } + s := strings.TrimSpace(string(out)) + if s == "" { + return + } + preview := s + if len(preview) > 800 { + preview = preview[:800] + "…" + } + logging.DebugStep(logger, operation, "Command output (%s): %s: %s", label, cmdStr, preview) +} + +func debugLogOnDiskFile(logger *logging.Logger, label, path string, info os.FileInfo, data []byte) { + if logger == nil || info == nil { + return + } + sum := sha256.Sum256(data) + sha := hex.EncodeToString(sum[:]) + logging.DebugStep(logger, "network snapshot", "File ok (%s): %s mode=%s size=%d mtime=%s sha256=%s", label, path, info.Mode().String(), info.Size(), info.ModTime().Format(time.RFC3339), sha) + + extracted := extractInterestingLines(path, data, snapshotLogMaxExtractedLines) + if len(extracted) == 0 { + return + } + for _, line := range extracted { + logging.DebugStep(logger, "network snapshot", "File excerpt (%s): %s: %s", label, path, line) + } +} + +func extractInterestingLines(path string, data []byte, limit int) []string { + if limit <= 0 { + return nil + } + var out []string + scanner := bufio.NewScanner(strings.NewReader(string(data))) + for scanner.Scan() { + line := strings.TrimSpace(scanner.Text()) + if line == "" || strings.HasPrefix(line, "#") { + continue + } + if isInterestingNetworkLine(path, line) { + out = append(out, line) + if len(out) >= limit { + break + } + } + } + return out +} + +func isInterestingNetworkLine(path, line string) bool { + switch path { + case "/etc/network/interfaces": + return strings.HasPrefix(line, "auto ") || + strings.HasPrefix(line, "iface ") || + strings.HasPrefix(line, "allow-hotplug ") || + strings.HasPrefix(line, "address ") || + strings.HasPrefix(line, "gateway ") || + strings.HasPrefix(line, "bridge-") || + strings.HasPrefix(line, "source ") + case "/etc/resolv.conf": + return strings.HasPrefix(line, "nameserver ") || + strings.HasPrefix(line, "search ") || + strings.HasPrefix(line, "domain ") + case "/etc/hostname": + return true + case "/etc/hosts": + if strings.HasPrefix(line, "127.") || strings.HasPrefix(line, "::1") { + return false + } + return strings.HasPrefix(line, "10.") || + strings.HasPrefix(line, "172.") || + strings.HasPrefix(line, "192.") || + strings.Contains(line, ".local") || + strings.Contains(line, "pbs") || + strings.Contains(line, "pve") + default: + return true + } +} + func writeNetworkHealthReportFile(diagnosticsDir string, report networkHealthReport) (string, error) { return writeNetworkHealthReportFileNamed(diagnosticsDir, "health_after.txt", report) } diff --git a/internal/orchestrator/prompts_cli.go b/internal/orchestrator/prompts_cli.go index 7958157..4f4c03a 100644 --- a/internal/orchestrator/prompts_cli.go +++ b/internal/orchestrator/prompts_cli.go @@ -3,10 +3,14 @@ package orchestrator import ( "bufio" "context" + "errors" "fmt" + "os" "strings" + "time" "github.com/tis24dev/proxsave/internal/input" + "github.com/tis24dev/proxsave/internal/logging" ) func promptYesNo(ctx context.Context, reader *bufio.Reader, prompt string) (bool, error) { @@ -42,3 +46,87 @@ func promptYesNoWithDefault(ctx context.Context, reader *bufio.Reader, prompt st } } } + +// promptYesNoWithCountdown prompts the user for a yes/no answer while showing a visible countdown. 
+// If no input is received before the timeout expires, it proceeds safely with "No". +// Pressing Enter chooses the provided defaultYes. +func promptYesNoWithCountdown(ctx context.Context, reader *bufio.Reader, logger *logging.Logger, question string, timeout time.Duration, defaultYes bool) (bool, error) { + defStr := "[Y/n]" + if !defaultYes { + defStr = "[y/N]" + } + + question = strings.TrimSpace(question) + if question == "" { + question = "Proceed?" + } + if timeout <= 0 { + return promptYesNoWithDefault(ctx, reader, fmt.Sprintf("%s %s ", question, defStr), defaultYes) + } + + deadline := time.Now().Add(timeout) + logging.DebugStep(logger, "prompt yes/no", "Start: question=%q defaultYes=%v timeout=%s", question, defaultYes, timeout) + + ctxTimeout, cancel := context.WithDeadline(ctx, deadline) + defer cancel() + + inputCh := make(chan string, 1) + errCh := make(chan error, 1) + + go func() { + line, err := input.ReadLineWithContext(ctxTimeout, reader) + if err != nil { + errCh <- err + return + } + inputCh <- line + }() + + ticker := time.NewTicker(1 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ticker.C: + left := time.Until(deadline) + if left < 0 { + left = 0 + } + fmt.Fprintf(os.Stderr, "\rAuto-skip in %ds... %s %s ", int(left.Seconds()), question, defStr) + if left <= 0 { + fmt.Fprintln(os.Stderr) + logging.DebugStep(logger, "prompt yes/no", "Timeout expired: proceeding with No") + logger.Info("No response within %ds; proceeding with No.", int(timeout.Seconds())) + return false, nil + } + case line := <-inputCh: + fmt.Fprintln(os.Stderr) + trimmed := strings.ToLower(strings.TrimSpace(line)) + logging.DebugStep(logger, "prompt yes/no", "User input received: %q", trimmed) + switch trimmed { + case "": + return defaultYes, nil + case "y", "yes": + return true, nil + case "n", "no": + return false, nil + default: + logger.Info("Unrecognized input %q; proceeding with No.", strings.TrimSpace(line)) + return false, nil + } + case err := <-errCh: + fmt.Fprintln(os.Stderr) + if errors.Is(err, context.DeadlineExceeded) { + logging.DebugStep(logger, "prompt yes/no", "Input timed out: proceeding with No") + logger.Info("No response within %ds; proceeding with No.", int(timeout.Seconds())) + return false, nil + } + if errors.Is(err, context.Canceled) { + logging.DebugStep(logger, "prompt yes/no", "Input canceled: %v", err) + return false, err + } + logging.DebugStep(logger, "prompt yes/no", "Input error: %v", err) + return false, err + } + } +} diff --git a/internal/orchestrator/restore.go b/internal/orchestrator/restore.go index 4a0e426..d50d267 100644 --- a/internal/orchestrator/restore.go +++ b/internal/orchestrator/restore.go @@ -38,6 +38,28 @@ var ( prepareDecryptedBackupFunc = prepareDecryptedBackup ) +// RestoreAbortInfo contains information about an aborted restore with network rollback. +type RestoreAbortInfo struct { + NetworkRollbackArmed bool + NetworkRollbackLog string + NetworkRollbackMarker string + OriginalIP string // IP from backup file (will be restored by rollback) + CurrentIP string // IP after apply (before rollback) + RollbackDeadline time.Time // when rollback will execute +} + +var lastRestoreAbortInfo *RestoreAbortInfo + +// GetLastRestoreAbortInfo returns info about the last restore abort, if any. +func GetLastRestoreAbortInfo() *RestoreAbortInfo { + return lastRestoreAbortInfo +} + +// ClearRestoreAbortInfo clears the stored abort info. 
+func ClearRestoreAbortInfo() { + lastRestoreAbortInfo = nil +} + func RunRestoreWorkflow(ctx context.Context, cfg *config.Config, logger *logging.Logger, version string) (err error) { if cfg == nil { return fmt.Errorf("configuration not available") @@ -232,7 +254,9 @@ func RunRestoreWorkflow(ctx context.Context, cfg *config.Config, logger *logging } clusterServicesStopped = true defer func() { - if err := startPVEClusterServices(ctx, logger); err != nil { + restartCtx, cancel := context.WithTimeout(context.Background(), 2*serviceStartTimeout+2*serviceVerifyTimeout+10*time.Second) + defer cancel() + if err := startPVEClusterServices(restartCtx, logger); err != nil { logger.Warning("Failed to restart PVE services after restore: %v", err) } }() @@ -254,7 +278,9 @@ func RunRestoreWorkflow(ctx context.Context, cfg *config.Config, logger *logging } else { pbsServicesStopped = true defer func() { - if err := startPBSServices(ctx, logger); err != nil { + restartCtx, cancel := context.WithTimeout(context.Background(), 2*serviceStartTimeout+2*serviceVerifyTimeout+10*time.Second) + defer cancel() + if err := startPBSServices(restartCtx, logger); err != nil { logger.Warning("Failed to restart PBS services after restore: %v", err) } }() @@ -353,6 +379,9 @@ func RunRestoreWorkflow(ctx context.Context, cfg *config.Config, logger *logging } if exportLog, err := extractSelectiveArchive(ctx, prepared.ArchivePath, exportRoot, plan.ExportCategories, RestoreModeCustom, logger); err != nil { + if errors.Is(err, ErrRestoreAborted) || input.IsAborted(err) { + return err + } logger.Warning("Export completed with errors: %v", err) } else { exportLogPath = exportLog @@ -364,6 +393,9 @@ func RunRestoreWorkflow(ctx context.Context, cfg *config.Config, logger *logging if exportRoot == "" { logger.Warning("Cluster SAFE mode selected but export directory not available; skipping automatic pvesh apply") } else if err := runSafeClusterApply(ctx, reader, exportRoot, logger); err != nil { + if errors.Is(err, ErrRestoreAborted) || input.IsAborted(err) { + return err + } logger.Warning("Cluster SAFE apply completed with errors: %v", err) } } @@ -380,6 +412,9 @@ func RunRestoreWorkflow(ctx context.Context, cfg *config.Config, logger *logging } if stageLog, err := extractSelectiveArchive(ctx, prepared.ArchivePath, stageRoot, plan.StagedCategories, RestoreModeCustom, logger); err != nil { + if errors.Is(err, ErrRestoreAborted) || input.IsAborted(err) { + return err + } logger.Warning("Staging completed with errors: %v", err) } else { stageLogPath = stageLog @@ -387,12 +422,18 @@ func RunRestoreWorkflow(ctx context.Context, cfg *config.Config, logger *logging logger.Info("") if err := maybeApplyPBSConfigsFromStage(ctx, logger, plan, stageRoot, cfg.DryRun); err != nil { + if errors.Is(err, ErrRestoreAborted) || input.IsAborted(err) { + return err + } logger.Warning("PBS staged config apply: %v", err) } } stageRootForNetworkApply := stageRoot if installed, err := maybeInstallNetworkConfigFromStage(ctx, logger, plan, stageRoot, prepared.ArchivePath, networkRollbackBackup, cfg.DryRun); err != nil { + if errors.Is(err, ErrRestoreAborted) || input.IsAborted(err) { + return err + } logger.Warning("Network staged install: %v", err) } else if installed { stageRootForNetworkApply = "" @@ -428,12 +469,20 @@ func RunRestoreWorkflow(ctx context.Context, cfg *config.Config, logger *logging } else { fsCategory := []Category{*fsCat} if _, err := extractSelectiveArchive(ctx, prepared.ArchivePath, fsTempDir, fsCategory, RestoreModeCustom, logger); 
err != nil { + if errors.Is(err, ErrRestoreAborted) || input.IsAborted(err) { + return err + } logger.Warning("Failed to extract filesystem config for merge: %v", err) } else { // Perform Smart Merge currentFstab := filepath.Join(destRoot, "etc", "fstab") backupFstab := filepath.Join(fsTempDir, "etc", "fstab") if err := SmartMergeFstab(ctx, logger, reader, currentFstab, backupFstab, cfg.DryRun); err != nil { + if errors.Is(err, ErrRestoreAborted) || input.IsAborted(err) { + logger.Info("Restore aborted by user during Smart Filesystem Configuration Merge.") + return err + } + restoreHadWarnings = true logger.Warning("Smart Fstab Merge failed: %v", err) } } @@ -445,6 +494,9 @@ func RunRestoreWorkflow(ctx context.Context, cfg *config.Config, logger *logging if plan.HasCategoryID("network") { logger.Info("") if err := maybeRepairResolvConfAfterRestore(ctx, logger, prepared.ArchivePath, cfg.DryRun); err != nil { + if errors.Is(err, ErrRestoreAborted) || input.IsAborted(err) { + return err + } restoreHadWarnings = true logger.Warning("DNS resolver repair: %v", err) } @@ -452,18 +504,37 @@ func RunRestoreWorkflow(ctx context.Context, cfg *config.Config, logger *logging logger.Info("") if err := maybeApplyNetworkConfigCLI(ctx, reader, logger, plan, safetyBackup, networkRollbackBackup, stageRootForNetworkApply, prepared.ArchivePath, cfg.DryRun); err != nil { + if errors.Is(err, ErrRestoreAborted) || input.IsAborted(err) { + logger.Info("Restore aborted by user during network apply prompt.") + return err + } restoreHadWarnings = true if errors.Is(err, ErrNetworkApplyNotCommitted) { var notCommitted *NetworkApplyNotCommittedError - restoredIP := "unknown" + observedIP := "unknown" rollbackLog := "" + rollbackArmed := false if errors.As(err, ¬Committed) && notCommitted != nil { if strings.TrimSpace(notCommitted.RestoredIP) != "" { - restoredIP = strings.TrimSpace(notCommitted.RestoredIP) + observedIP = strings.TrimSpace(notCommitted.RestoredIP) } rollbackLog = strings.TrimSpace(notCommitted.RollbackLog) + rollbackArmed = notCommitted.RollbackArmed + // Save abort info for footer display + lastRestoreAbortInfo = &RestoreAbortInfo{ + NetworkRollbackArmed: rollbackArmed, + NetworkRollbackLog: rollbackLog, + NetworkRollbackMarker: strings.TrimSpace(notCommitted.RollbackMarker), + OriginalIP: notCommitted.OriginalIP, + CurrentIP: observedIP, + RollbackDeadline: notCommitted.RollbackDeadline, + } + } + if rollbackArmed { + logger.Warning("Network apply not committed; rollback is ARMED and will run automatically. Current IP: %s", observedIP) + } else { + logger.Warning("Network apply not committed; rollback has executed (or marker cleared). Current IP: %s", observedIP) } - logger.Warning("Network apply not committed and original settings restored. 
IP: %s", restoredIP) if rollbackLog != "" { logger.Info("Rollback log: %s", rollbackLog) } @@ -1182,6 +1253,10 @@ func runFullRestore(ctx context.Context, reader *bufio.Reader, candidate *decryp currentFstab := filepath.Join(destRoot, "etc", "fstab") backupFstab := filepath.Join(fsTempDir, "etc", "fstab") if err := SmartMergeFstab(ctx, logger, reader, currentFstab, backupFstab, dryRun); err != nil { + if errors.Is(err, ErrRestoreAborted) || input.IsAborted(err) { + logger.Info("Restore aborted by user during Smart Filesystem Configuration Merge.") + return err + } logger.Warning("Smart Fstab Merge failed: %v", err) } } diff --git a/internal/orchestrator/restore_filesystem.go b/internal/orchestrator/restore_filesystem.go index 8d7f04c..020a8c6 100644 --- a/internal/orchestrator/restore_filesystem.go +++ b/internal/orchestrator/restore_filesystem.go @@ -8,8 +8,8 @@ import ( "os" "path/filepath" "strings" + "time" - "github.com/tis24dev/proxsave/internal/input" "github.com/tis24dev/proxsave/internal/logging" ) @@ -67,8 +67,8 @@ func SmartMergeFstab(ctx context.Context, logger *logging.Logger, reader *bufio. } defaultYes := analysis.RootComparable && analysis.RootMatch && (!analysis.SwapComparable || analysis.SwapMatch) - confirmMsg := "Vuoi aggiungere i mount mancanti (NFS/CIFS e dati su UUID/LABEL verificati)?" - confirmed, err := confirmLocal(ctx, reader, confirmMsg, defaultYes) + confirmMsg := "Do you want to add the missing mounts (NFS/CIFS and data mounts with verified UUID/LABEL)?" + confirmed, err := promptYesNoWithCountdown(ctx, reader, logger, confirmMsg, 90*time.Second, defaultYes) if err != nil { return err } @@ -82,26 +82,6 @@ func SmartMergeFstab(ctx context.Context, logger *logging.Logger, reader *bufio. return applyFstabMerge(ctx, logger, currentRaw, currentFstabPath, analysis.ProposedMounts, dryRun) } -// confirmLocal prompts for yes/no -func confirmLocal(ctx context.Context, reader *bufio.Reader, prompt string, defaultYes bool) (bool, error) { - defStr := "[Y/n]" - if !defaultYes { - defStr = "[y/N]" - } - fmt.Printf("%s %s ", prompt, defStr) - - line, err := input.ReadLineWithContext(ctx, reader) - if err != nil { - return false, err - } - - trimmed := strings.TrimSpace(strings.ToLower(line)) - if trimmed == "" { - return defaultYes, nil - } - return trimmed == "y" || trimmed == "yes", nil -} - func parseFstab(path string) ([]FstabEntry, []string, error) { content, err := restoreFS.ReadFile(path) if err != nil { @@ -304,13 +284,13 @@ func isSafeMountCandidate(e FstabEntry) bool { func printFstabAnalysis(logger *logging.Logger, res FstabAnalysisResult) { fmt.Println() - logger.Info("Analisi fstab:") + logger.Info("fstab analysis:") // Root Status if !res.RootComparable { - logger.Warning("! Root filesystem: non determinabile (entry mancante in current/backup fstab)") + logger.Warning("! Root filesystem: undetermined (missing entry in current/backup fstab)") } else if res.RootMatch { - logger.Info("✓ Root filesystem: compatibile (UUID kept from system)") + logger.Info("✓ Root filesystem: compatible (UUID kept from system)") } else { // ANSI Yellow/Red might be nice, but stick to standard logger for now. logger.Warning("! 
-	confirmMsg := "Vuoi aggiungere i mount mancanti (NFS/CIFS e dati su UUID/LABEL verificati)?"
-	confirmed, err := confirmLocal(ctx, reader, confirmMsg, defaultYes)
+	confirmMsg := "Do you want to add the missing mounts (NFS/CIFS and data mounts with verified UUID/LABEL)?"
+	confirmed, err := promptYesNoWithCountdown(ctx, reader, logger, confirmMsg, 90*time.Second, defaultYes)
 	if err != nil {
 		return err
 	}
@@ -82,26 +82,6 @@ func SmartMergeFstab(ctx context.Context, logger *logging.Logger, reader *bufio.
 	return applyFstabMerge(ctx, logger, currentRaw, currentFstabPath, analysis.ProposedMounts, dryRun)
 }

-// confirmLocal prompts for yes/no
-func confirmLocal(ctx context.Context, reader *bufio.Reader, prompt string, defaultYes bool) (bool, error) {
-	defStr := "[Y/n]"
-	if !defaultYes {
-		defStr = "[y/N]"
-	}
-	fmt.Printf("%s %s ", prompt, defStr)
-
-	line, err := input.ReadLineWithContext(ctx, reader)
-	if err != nil {
-		return false, err
-	}
-
-	trimmed := strings.TrimSpace(strings.ToLower(line))
-	if trimmed == "" {
-		return defaultYes, nil
-	}
-	return trimmed == "y" || trimmed == "yes", nil
-}
-
 func parseFstab(path string) ([]FstabEntry, []string, error) {
 	content, err := restoreFS.ReadFile(path)
 	if err != nil {
@@ -304,13 +284,13 @@ func isSafeMountCandidate(e FstabEntry) bool {
 func printFstabAnalysis(logger *logging.Logger, res FstabAnalysisResult) {
 	fmt.Println()
-	logger.Info("Analisi fstab:")
+	logger.Info("fstab analysis:")

 	// Root Status
 	if !res.RootComparable {
-		logger.Warning("! Root filesystem: non determinabile (entry mancante in current/backup fstab)")
+		logger.Warning("! Root filesystem: undetermined (missing entry in current/backup fstab)")
 	} else if res.RootMatch {
-		logger.Info("✓ Root filesystem: compatibile (UUID kept from system)")
+		logger.Info("✓ Root filesystem: compatible (UUID kept from system)")
 	} else {
 		// ANSI Yellow/Red might be nice, but stick to standard logger for now.
 		logger.Warning("! Root UUID mismatch: Backup is from a different machine (System info preserved)")
 	}
@@ -319,9 +299,9 @@ func printFstabAnalysis(logger *logging.Logger, res FstabAnalysisResult) {

 	// Swap Status
 	if !res.SwapComparable {
-		logger.Info("Swap: non determinabile (entry mancante in current/backup fstab)")
+		logger.Info("Swap: undetermined (missing entry in current/backup fstab)")
 	} else if res.SwapMatch {
-		logger.Info("✓ Swap: compatibile")
+		logger.Info("✓ Swap: compatible")
 	} else {
 		logger.Warning("! Swap mismatch: keeping current swap configuration")
 		logger.Debug("  Details: Current=%s, Backup=%s", res.SwapDeviceCurrent, res.SwapDeviceBackup)
@@ -329,20 +309,20 @@ func printFstabAnalysis(logger *logging.Logger, res FstabAnalysisResult) {

 	// New Entries
 	if len(res.ProposedMounts) > 0 {
-		logger.Info("+ %d mount(s) sicuri trovati nel backup ma non nel sistema attuale:", len(res.ProposedMounts))
+		logger.Info("+ %d safe mount(s) found in the backup but missing from the current system:", len(res.ProposedMounts))
 		for _, m := range res.ProposedMounts {
 			logger.Info("  %s -> %s (%s)", m.Device, m.MountPoint, m.Type)
 		}
 	} else {
-		logger.Info("✓ Nessun mount aggiuntivo trovato nel backup.")
+		logger.Info("✓ No additional mounts found in the backup.")
 	}

 	if len(res.SkippedMounts) > 0 {
-		logger.Warning("! %d mount(s) trovati ma NON proposti automaticamente (potenzialmente rischiosi):", len(res.SkippedMounts))
+		logger.Warning("! %d mount(s) found but NOT auto-proposed (potentially risky):", len(res.SkippedMounts))
 		for _, m := range res.SkippedMounts {
 			logger.Warning("  %s -> %s (%s)", m.Device, m.MountPoint, m.Type)
 		}
-		logger.Info("  Suggerimento: verifica dischi/UUID e opzioni (nofail/_netdev) prima di aggiungerli a /etc/fstab.")
+		logger.Info("  Hint: verify disks/UUIDs and options (nofail/_netdev) before adding them to /etc/fstab.")
 	}
 	fmt.Println()
 }
diff --git a/internal/orchestrator/restore_tui.go b/internal/orchestrator/restore_tui.go
index 46a877a..898e812 100644
--- a/internal/orchestrator/restore_tui.go
+++ b/internal/orchestrator/restore_tui.go
@@ -9,12 +9,14 @@ import (
 	"path/filepath"
 	"sort"
 	"strings"
+	"sync/atomic"
 	"time"

 	"github.com/gdamore/tcell/v2"
 	"github.com/rivo/tview"

 	"github.com/tis24dev/proxsave/internal/config"
+	"github.com/tis24dev/proxsave/internal/input"
 	"github.com/tis24dev/proxsave/internal/logging"
 	"github.com/tis24dev/proxsave/internal/tui"
 	"github.com/tis24dev/proxsave/internal/tui/components"
@@ -243,7 +245,9 @@ func RunRestoreWorkflowTUI(ctx context.Context, cfg *config.Config, logger *logg
 	}
 	clusterServicesStopped = true
 	defer func() {
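+		// Use a fresh context for the deferred restart: the workflow ctx may
+		// already be cancelled by this point, and services must still come back up.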
-		if err := startPVEClusterServices(ctx, logger); err != nil {
+		restartCtx, cancel := context.WithTimeout(context.Background(), 2*serviceStartTimeout+2*serviceVerifyTimeout+10*time.Second)
+		defer cancel()
+		if err := startPVEClusterServices(restartCtx, logger); err != nil {
 			logger.Warning("Failed to restart PVE services after restore: %v", err)
 		}
 	}()
@@ -270,7 +274,9 @@ func RunRestoreWorkflowTUI(ctx context.Context, cfg *config.Config, logger *logg
 	} else {
 		pbsServicesStopped = true
 		defer func() {
-			if err := startPBSServices(ctx, logger); err != nil {
+			restartCtx, cancel := context.WithTimeout(context.Background(), 2*serviceStartTimeout+2*serviceVerifyTimeout+10*time.Second)
+			defer cancel()
+			if err := startPBSServices(restartCtx, logger); err != nil {
 				logger.Warning("Failed to restart PBS services after restore: %v", err)
 			}
 		}()
@@ -353,6 +359,9 @@ func RunRestoreWorkflowTUI(ctx context.Context, cfg *config.Config, logger *logg
 	}
 	if exportLog, exErr := extractSelectiveArchive(ctx, prepared.ArchivePath, exportRoot, plan.ExportCategories, RestoreModeCustom, logger); exErr != nil {
+		if errors.Is(exErr, ErrRestoreAborted) || input.IsAborted(exErr) {
+			return exErr
+		}
 		logger.Warning("Export completed with errors: %v", exErr)
 	} else {
 		exportLogPath = exportLog
@@ -365,6 +374,9 @@ func RunRestoreWorkflowTUI(ctx context.Context, cfg *config.Config, logger *logg
 		logger.Warning("Cluster SAFE mode selected but export directory not available; skipping automatic pvesh apply")
 	} else if err := runSafeClusterApply(ctx, bufio.NewReader(os.Stdin), exportRoot, logger); err != nil {
 		// Note: runSafeClusterApply currently uses console prompts; this step remains non-TUI.
+		if errors.Is(err, ErrRestoreAborted) || input.IsAborted(err) {
+			return err
+		}
 		logger.Warning("Cluster SAFE apply completed with errors: %v", err)
 	}
 }
@@ -381,6 +393,9 @@ func RunRestoreWorkflowTUI(ctx context.Context, cfg *config.Config, logger *logg
 	}
 	if stageLog, err := extractSelectiveArchive(ctx, prepared.ArchivePath, stageRoot, plan.StagedCategories, RestoreModeCustom, logger); err != nil {
+		if errors.Is(err, ErrRestoreAborted) || input.IsAborted(err) {
+			return err
+		}
 		logger.Warning("Staging completed with errors: %v", err)
 	} else {
 		stageLogPath = stageLog
@@ -388,12 +403,18 @@ func RunRestoreWorkflowTUI(ctx context.Context, cfg *config.Config, logger *logg
 	}

 	logger.Info("")
 	if err := maybeApplyPBSConfigsFromStage(ctx, logger, plan, stageRoot, cfg.DryRun); err != nil {
+		if errors.Is(err, ErrRestoreAborted) || input.IsAborted(err) {
+			return err
+		}
 		logger.Warning("PBS staged config apply: %v", err)
 	}
 }

 stageRootForNetworkApply := stageRoot
 if installed, err := maybeInstallNetworkConfigFromStage(ctx, logger, plan, stageRoot, prepared.ArchivePath, networkRollbackBackup, cfg.DryRun); err != nil {
+	if errors.Is(err, ErrRestoreAborted) || input.IsAborted(err) {
+		return err
+	}
 	logger.Warning("Network staged install: %v", err)
 } else if installed {
 	stageRootForNetworkApply = ""
@@ -417,12 +438,18 @@ func RunRestoreWorkflowTUI(ctx context.Context, cfg *config.Config, logger *logg
 	if plan.HasCategoryID("network") {
 		logger.Info("")
 		if err := maybeRepairResolvConfAfterRestore(ctx, logger, prepared.ArchivePath, cfg.DryRun); err != nil {
+			if errors.Is(err, ErrRestoreAborted) || input.IsAborted(err) {
+				return err
+			}
 			logger.Warning("DNS resolver repair: %v", err)
 		}
 	}

 	logger.Info("")
 	if err := maybeApplyNetworkConfigTUI(ctx, logger, plan, safetyBackup, networkRollbackBackup, stageRootForNetworkApply, prepared.ArchivePath, configPath, buildSig, cfg.DryRun); err != nil {
+		if errors.Is(err, ErrRestoreAborted) || input.IsAborted(err) {
+			return err
+		}
 		logger.Warning("Network apply step skipped or failed: %v", err)
 	}
@@ -519,6 +546,7 @@ func runRestoreSelectionWizard(ctx context.Context, cfg *config.Config, logger *
 	selection := &restoreSelection{}
 	var selectionErr error
 	var scan scanController
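+	// scanSeq tags each background scan with a monotonically increasing ID so
+	// overlapping or repeated scans can be told apart in the debug logs.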
+	var scanSeq uint64

 	pathList := tview.NewList().ShowSecondaryText(false)
 	pathList.SetMainTextColor(tcell.ColorWhite).
@@ -536,6 +564,7 @@ func runRestoreSelectionWizard(ctx context.Context, cfg *config.Config, logger *
 			return
 		}
 		selectedOption := options[index]
+		scanID := atomic.AddUint64(&scanSeq, 1)
 		logging.DebugStep(logger, "restore selection wizard", "selected source label=%q path=%q rclone=%v", selectedOption.Label, selectedOption.Path, selectedOption.IsRclone)
 		pages.SwitchToPage("paths-loading")
 		go func() {
@@ -544,11 +573,18 @@ func runRestoreSelectionWizard(ctx context.Context, cfg *config.Config, logger *
 			var candidates []*decryptCandidate
 			var scanErr error

-			scanDone := logging.DebugStart(logger, "scan backup source", "path=%s rclone=%v", selectedOption.Path, selectedOption.IsRclone)
+			scanDone := logging.DebugStart(logger, "scan backup source", "id=%d path=%s rclone=%v", scanID, selectedOption.Path, selectedOption.IsRclone)
 			defer func() { scanDone(scanErr) }()

 			if selectedOption.IsRclone {
-				candidates, scanErr = discoverRcloneBackups(scanCtx, selectedOption.Path, logger)
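+				// Bound the remote scan: 30s by default, overridable via
+				// RCLONE_TIMEOUT_CONNECTION (expressed in seconds).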
(%v)", selectedOption.Path, scanErr) + } showRestoreErrorModal(app, pages, configPath, buildSig, message, func() { pages.SwitchToPage("paths") }) @@ -1120,17 +1159,25 @@ func maybeApplyNetworkConfigTUI(ctx context.Context, logger *logging.Logger, pla } logging.DebugStep(logger, "network safe apply (tui)", "Prompt: apply network now with rollback timer") + sourceLine := "Source: /etc/network (will be applied)" + if strings.TrimSpace(stageRoot) != "" { + sourceLine = fmt.Sprintf("Source: %s (will be copied to /etc and applied)", strings.TrimSpace(stageRoot)) + } message := fmt.Sprintf( - "Apply restored network configuration now with an automatic rollback timer (%ds).\n\nIf you do not commit the changes, the previous network configuration will be restored automatically.\n\nProceed with live network apply?", + "Network restore: a restored network configuration is ready to apply.\n%s\n\nThis will reload networking immediately (no reboot).\n\nWARNING: This may change the active IP and disconnect SSH/Web sessions.\n\nAfter applying, type COMMIT within %ds or ProxSave will roll back automatically.\n\nRecommendation: run this step from the local console/IPMI, not over SSH.\n\nApply network configuration now?", + sourceLine, int(defaultNetworkRollbackTimeout.Seconds()), ) - applyNow, err := promptYesNoTUIFunc( + applyNow, err := promptYesNoTUIWithCountdown( + ctx, + logger, "Apply network configuration", configPath, buildSig, message, "Apply now", "Skip apply", + 90*time.Second, ) if err != nil { return err @@ -1896,26 +1943,26 @@ func runFullRestoreTUI(ctx context.Context, candidate *decryptCandidate, prepare logger.Info("No new safe mounts found to restore. Keeping current fstab.") } else { var msg strings.Builder - msg.WriteString("ProxSave ha trovato mount mancanti in /etc/fstab.\n\n") + msg.WriteString("ProxSave found missing mounts in /etc/fstab.\n\n") if analysis.RootComparable && !analysis.RootMatch { - msg.WriteString("⚠ Root UUID mismatch: il backup sembra provenire da una macchina diversa.\n") + msg.WriteString("⚠ Root UUID mismatch: the backup appears to come from a different machine.\n") } if analysis.SwapComparable && !analysis.SwapMatch { - msg.WriteString("⚠ Swap mismatch: verrà mantenuta la configurazione swap attuale.\n") + msg.WriteString("⚠ Swap mismatch: the current swap configuration will be kept.\n") } - msg.WriteString("\nMount proposti (sicuri):\n") + msg.WriteString("\nProposed mounts (safe):\n") for _, m := range analysis.ProposedMounts { fmt.Fprintf(&msg, " - %s -> %s (%s)\n", m.Device, m.MountPoint, m.Type) } if len(analysis.SkippedMounts) > 0 { - msg.WriteString("\nMount trovati ma non proposti automaticamente:\n") + msg.WriteString("\nMounts found but not auto-proposed:\n") for _, m := range analysis.SkippedMounts { fmt.Fprintf(&msg, " - %s -> %s (%s)\n", m.Device, m.MountPoint, m.Type) } - msg.WriteString("\nSuggerimento: verifica dischi/UUID e opzioni (nofail/_netdev) prima di aggiungerli.\n") + msg.WriteString("\nHint: verify disks/UUIDs and options (nofail/_netdev) before adding them.\n") } - apply, perr := promptYesNoTUIFunc("Smart fstab merge", configPath, buildSig, msg.String(), "Apply", "Skip") + apply, perr := promptYesNoTUIWithCountdown(ctx, logger, "Smart fstab merge", configPath, buildSig, msg.String(), "Apply", "Skip", 90*time.Second) if perr != nil { return perr } @@ -1976,6 +2023,96 @@ func promptYesNoTUI(title, configPath, buildSig, message, yesLabel, noLabel stri return result, nil } +func promptYesNoTUIWithCountdown(ctx context.Context, logger 
+func promptYesNoTUIWithCountdown(ctx context.Context, logger *logging.Logger, title, configPath, buildSig, message, yesLabel, noLabel string, timeout time.Duration) (bool, error) {
+	app := newTUIApp()
+	var result bool
+	var cancelled bool
+	var timedOut bool
+
+	infoText := tview.NewTextView().
+		SetText(message).
+		SetWrap(true).
+		SetTextColor(tcell.ColorWhite).
+		SetDynamicColors(true)
+
+	countdownText := tview.NewTextView().
+		SetWrap(false).
+		SetTextColor(tcell.ColorYellow).
+		SetDynamicColors(true)
+
+	deadline := time.Now().Add(timeout)
+	updateCountdown := func() {
+		left := time.Until(deadline)
+		if left < 0 {
+			left = 0
+		}
+		countdownText.SetText(fmt.Sprintf("Auto-skip in %ds (default: %s)", int(left.Seconds()), noLabel))
+	}
+	updateCountdown()
+
+	form := components.NewForm(app)
+	form.SetOnSubmit(func(values map[string]string) error {
+		result = true
+		return nil
+	})
+	form.SetOnCancel(func() {
+		cancelled = true
+	})
+	form.AddSubmitButton(yesLabel)
+	form.AddCancelButton(noLabel)
+	enableFormNavigation(form, nil)
+
+	content := tview.NewFlex().
+		SetDirection(tview.FlexRow).
+		AddItem(infoText, 0, 1, false).
+		AddItem(countdownText, 1, 0, false).
+		AddItem(form.Form, 3, 0, true)
+
+	page := buildRestoreWizardPage(title, configPath, buildSig, content)
+	form.SetParentView(page)
+
+	stopCh := make(chan struct{})
+	defer close(stopCh)
+
+	if timeout > 0 {
+		go func() {
+			ticker := time.NewTicker(1 * time.Second)
+			defer ticker.Stop()
+			for {
+				select {
+				case <-stopCh:
+					return
+				case <-ctx.Done():
+					cancelled = true
+					app.Stop()
+					return
+				case <-ticker.C:
+					left := time.Until(deadline)
+					if left <= 0 {
+						timedOut = true
+						cancelled = true
+						app.Stop()
+						return
+					}
+					app.QueueUpdateDraw(func() { updateCountdown() })
+				}
+			}
+		}()
+	}
+
+	if err := app.SetRoot(page, true).SetFocus(form.Form).Run(); err != nil {
+		return false, err
+	}
+	if timedOut {
+		logging.DebugStep(logger, "prompt yes/no (tui)", "Timeout expired (%s): proceeding with No", timeout)
+		return false, nil
+	}
+	if cancelled {
+		return false, nil
+	}
+	return result, nil
+}
+
 func promptOkTUI(title, configPath, buildSig, message, okLabel string) error {
 	app := newTUIApp()
diff --git a/internal/orchestrator/restore_workflow_abort_test.go b/internal/orchestrator/restore_workflow_abort_test.go
new file mode 100644
index 0000000..329daa1
--- /dev/null
+++ b/internal/orchestrator/restore_workflow_abort_test.go
@@ -0,0 +1,143 @@
+package orchestrator
+
+import (
+	"bufio"
+	"context"
+	"os"
+	"path/filepath"
+	"testing"
+	"time"
+
+	"github.com/tis24dev/proxsave/internal/backup"
+	"github.com/tis24dev/proxsave/internal/config"
+	"github.com/tis24dev/proxsave/internal/logging"
+	"github.com/tis24dev/proxsave/internal/types"
+)
+
+func TestRunRestoreWorkflow_FstabPromptInputAborted_AbortsWorkflow(t *testing.T) {
+	origRestoreFS := restoreFS
+	origRestoreCmd := restoreCmd
+	origRestorePrompter := restorePrompter
+	origRestoreSystem := restoreSystem
+	origRestoreTime := restoreTime
+	origCompatFS := compatFS
+	origPrepare := prepareDecryptedBackupFunc
+	origSafetyFS := safetyFS
+	origSafetyNow := safetyNow
+	t.Cleanup(func() {
+		restoreFS = origRestoreFS
+		restoreCmd = origRestoreCmd
+		restorePrompter = origRestorePrompter
+		restoreSystem = origRestoreSystem
+		restoreTime = origRestoreTime
+		compatFS = origCompatFS
+		prepareDecryptedBackupFunc = origPrepare
+		safetyFS = origSafetyFS
+		safetyNow = origSafetyNow
+	})
+
+	fakeFS := NewFakeFS()
+	t.Cleanup(func() { _ = os.RemoveAll(fakeFS.Root) })
+	restoreFS = fakeFS
+	compatFS = fakeFS
+	safetyFS = fakeFS
+
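+	// Pin the clock so timestamp-derived names (e.g. fstab backup suffixes) are deterministic.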
&FakeTime{Current: time.Date(2020, 1, 2, 3, 4, 5, 0, time.UTC)} + restoreTime = fakeNow + safetyNow = fakeNow.Now + + restoreSystem = fakeSystemDetector{systemType: SystemTypePVE} + restoreCmd = runOnlyRunner{} + + // Make compatibility detection treat this as PVE. + if err := fakeFS.AddFile("/usr/bin/qm", []byte("x")); err != nil { + t.Fatalf("fakeFS.AddFile: %v", err) + } + + currentFstab := "UUID=root / ext4 defaults 0 1\nUUID=swap none swap sw 0 0\n" + if err := fakeFS.WriteFile("/etc/fstab", []byte(currentFstab), 0o640); err != nil { + t.Fatalf("fakeFS.WriteFile(/etc/fstab): %v", err) + } + + backupFstab := "UUID=root / ext4 defaults 0 1\nUUID=swap none swap sw 0 0\n192.168.1.246:/volume1/ProxmoxNFS /mnt/Synology_NFS nfs defaults 0 0\n" + tmpTar := filepath.Join(t.TempDir(), "bundle.tar") + if err := writeTarFile(tmpTar, map[string]string{ + "etc/fstab": backupFstab, + }); err != nil { + t.Fatalf("writeTarFile: %v", err) + } + tarBytes, err := os.ReadFile(tmpTar) + if err != nil { + t.Fatalf("os.ReadFile: %v", err) + } + if err := fakeFS.WriteFile("/bundle.tar", tarBytes, 0o640); err != nil { + t.Fatalf("fakeFS.WriteFile(/bundle.tar): %v", err) + } + + restorePrompter = fakeRestorePrompter{ + mode: RestoreModeCustom, + categories: []Category{mustCategoryByID(t, "filesystem")}, + confirmed: true, + } + + prepareDecryptedBackupFunc = func(ctx context.Context, reader *bufio.Reader, cfg *config.Config, logger *logging.Logger, version string, requireEncrypted bool) (*decryptCandidate, *preparedBundle, error) { + cand := &decryptCandidate{ + DisplayBase: "test", + Manifest: &backup.Manifest{ + CreatedAt: fakeNow.Now(), + ClusterMode: "standalone", + ProxmoxType: "pve", + ScriptVersion: "vtest", + }, + } + prepared := &preparedBundle{ + ArchivePath: "/bundle.tar", + Manifest: backup.Manifest{ArchivePath: "/bundle.tar"}, + cleanup: func() {}, + } + return cand, prepared, nil + } + + // Simulate Ctrl+C behavior: stdin closed -> input.ErrInputAborted during the fstab prompt. 
+	oldIn := os.Stdin
+	oldOut := os.Stdout
+	oldErr := os.Stderr
+	t.Cleanup(func() {
+		os.Stdin = oldIn
+		os.Stdout = oldOut
+		os.Stderr = oldErr
+	})
+	inR, inW, err := os.Pipe()
+	if err != nil {
+		t.Fatalf("os.Pipe: %v", err)
+	}
+	out, err := os.OpenFile(os.DevNull, os.O_WRONLY, 0o666)
+	if err != nil {
+		_ = inR.Close()
+		_ = inW.Close()
+		t.Fatalf("OpenFile(%s): %v", os.DevNull, err)
+	}
+	errOut, err := os.OpenFile(os.DevNull, os.O_WRONLY, 0o666)
+	if err != nil {
+		_ = inR.Close()
+		_ = inW.Close()
+		_ = out.Close()
+		t.Fatalf("OpenFile(%s): %v", os.DevNull, err)
+	}
+	os.Stdin = inR
+	os.Stdout = out
+	os.Stderr = errOut
+	t.Cleanup(func() {
+		_ = inR.Close()
+		_ = out.Close()
+		_ = errOut.Close()
+	})
+	_ = inW.Close()
+
+	logger := logging.New(types.LogLevelError, false)
+	cfg := &config.Config{BaseDir: "/base"}
+
+	if err := RunRestoreWorkflow(context.Background(), cfg, logger, "vtest"); err != ErrRestoreAborted {
+		t.Fatalf("err=%v; want %v", err, ErrRestoreAborted)
+	}
+}
diff --git a/internal/orchestrator/restore_workflow_warnings_test.go b/internal/orchestrator/restore_workflow_warnings_test.go
new file mode 100644
index 0000000..b7aeb1e
--- /dev/null
+++ b/internal/orchestrator/restore_workflow_warnings_test.go
@@ -0,0 +1,174 @@
+package orchestrator
+
+import (
+	"bufio"
+	"context"
+	"errors"
+	"io/fs"
+	"os"
+	"path/filepath"
+	"testing"
+	"time"
+
+	"github.com/tis24dev/proxsave/internal/backup"
+	"github.com/tis24dev/proxsave/internal/config"
+	"github.com/tis24dev/proxsave/internal/logging"
+	"github.com/tis24dev/proxsave/internal/types"
+)
+
+type failWritePathFS struct {
+	FS
+	failPath string
+	failErr  error
+}
+
+func (f failWritePathFS) WriteFile(path string, data []byte, perm fs.FileMode) error {
+	if filepath.Clean(path) == filepath.Clean(f.failPath) {
+		if f.failErr != nil {
+			return f.failErr
+		}
+		return errors.New("injected write failure")
+	}
+	return f.FS.WriteFile(path, data, perm)
+}
+
+func TestRunRestoreWorkflow_FstabMergeFails_ContinuesWithWarnings(t *testing.T) {
+	origRestoreFS := restoreFS
+	origRestoreCmd := restoreCmd
+	origRestorePrompter := restorePrompter
+	origRestoreSystem := restoreSystem
+	origRestoreTime := restoreTime
+	origCompatFS := compatFS
+	origPrepare := prepareDecryptedBackupFunc
+	origSafetyFS := safetyFS
+	origSafetyNow := safetyNow
+	t.Cleanup(func() {
+		restoreFS = origRestoreFS
+		restoreCmd = origRestoreCmd
+		restorePrompter = origRestorePrompter
+		restoreSystem = origRestoreSystem
+		restoreTime = origRestoreTime
+		compatFS = origCompatFS
+		prepareDecryptedBackupFunc = origPrepare
+		safetyFS = origSafetyFS
+		safetyNow = origSafetyNow
+	})
+
+	fakeFS := NewFakeFS()
+	t.Cleanup(func() { _ = os.RemoveAll(fakeFS.Root) })
+	restoreFS = fakeFS
+	compatFS = fakeFS
+	safetyFS = fakeFS
+
+	fakeNow := &FakeTime{Current: time.Date(2020, 1, 2, 3, 4, 5, 0, time.UTC)}
+	restoreTime = fakeNow
+	safetyNow = fakeNow.Now
+
+	restoreSystem = fakeSystemDetector{systemType: SystemTypePVE}
+	restoreCmd = runOnlyRunner{}
+
+	// Make compatibility detection treat this as PVE.
+	if err := fakeFS.AddFile("/usr/bin/qm", []byte("x")); err != nil {
+		t.Fatalf("fakeFS.AddFile: %v", err)
+	}
+
+	currentFstab := "UUID=root / ext4 defaults 0 1\nUUID=swap none swap sw 0 0\n"
+	if err := fakeFS.WriteFile("/etc/fstab", []byte(currentFstab), 0o640); err != nil {
+		t.Fatalf("fakeFS.WriteFile(/etc/fstab): %v", err)
+	}
+
+	backupFstab := "UUID=root / ext4 defaults 0 1\nUUID=swap none swap sw 0 0\n192.168.1.246:/volume1/ProxmoxNFS /mnt/Synology_NFS nfs defaults 0 0\n"
+	tmpTar := filepath.Join(t.TempDir(), "bundle.tar")
+	if err := writeTarFile(tmpTar, map[string]string{
+		"etc/fstab": backupFstab,
+	}); err != nil {
+		t.Fatalf("writeTarFile: %v", err)
+	}
+	tarBytes, err := os.ReadFile(tmpTar)
+	if err != nil {
+		t.Fatalf("os.ReadFile: %v", err)
+	}
+	if err := fakeFS.WriteFile("/bundle.tar", tarBytes, 0o640); err != nil {
+		t.Fatalf("fakeFS.WriteFile(/bundle.tar): %v", err)
+	}
+
+	restorePrompter = fakeRestorePrompter{
+		mode:       RestoreModeCustom,
+		categories: []Category{mustCategoryByID(t, "filesystem")},
+		confirmed:  true,
+	}
+
+	prepareDecryptedBackupFunc = func(ctx context.Context, reader *bufio.Reader, cfg *config.Config, logger *logging.Logger, version string, requireEncrypted bool) (*decryptCandidate, *preparedBundle, error) {
+		cand := &decryptCandidate{
+			DisplayBase: "test",
+			Manifest: &backup.Manifest{
+				CreatedAt:     fakeNow.Now(),
+				ClusterMode:   "standalone",
+				ProxmoxType:   "pve",
+				ScriptVersion: "vtest",
+			},
+		}
+		prepared := &preparedBundle{
+			ArchivePath: "/bundle.tar",
+			Manifest:    backup.Manifest{ArchivePath: "/bundle.tar"},
+			cleanup:     func() {},
+		}
+		return cand, prepared, nil
+	}
+
+	// Inject a non-abort error during applyFstabMerge backup write.
+	restoreFS = failWritePathFS{
+		FS:       fakeFS,
+		failPath: "/etc/fstab.bak-20200102-030405",
+		failErr:  errors.New("disk full"),
+	}
+
+	// Provide input to accept defaultYes (blank line).
+	oldIn := os.Stdin
+	oldOut := os.Stdout
+	oldErr := os.Stderr
+	t.Cleanup(func() {
+		os.Stdin = oldIn
+		os.Stdout = oldOut
+		os.Stderr = oldErr
+	})
+	inR, inW, err := os.Pipe()
+	if err != nil {
+		t.Fatalf("os.Pipe: %v", err)
+	}
+	out, err := os.OpenFile(os.DevNull, os.O_WRONLY, 0o666)
+	if err != nil {
+		_ = inR.Close()
+		_ = inW.Close()
+		t.Fatalf("OpenFile(%s): %v", os.DevNull, err)
+	}
+	errOut, err := os.OpenFile(os.DevNull, os.O_WRONLY, 0o666)
+	if err != nil {
+		_ = inR.Close()
+		_ = inW.Close()
+		_ = out.Close()
+		t.Fatalf("OpenFile(%s): %v", os.DevNull, err)
+	}
+	os.Stdin = inR
+	os.Stdout = out
+	os.Stderr = errOut
+	t.Cleanup(func() {
+		_ = inR.Close()
+		_ = out.Close()
+		_ = errOut.Close()
+	})
+	if _, err := inW.WriteString("\n"); err != nil {
+		t.Fatalf("WriteString: %v", err)
+	}
+	_ = inW.Close()
+
+	logger := logging.New(types.LogLevelWarning, false)
+	cfg := &config.Config{BaseDir: "/base"}
+
+	if err := RunRestoreWorkflow(context.Background(), cfg, logger, "vtest"); err != nil {
+		t.Fatalf("RunRestoreWorkflow error: %v", err)
+	}
+	if !logger.HasWarnings() {
+		t.Fatalf("expected warnings")
+	}
+}
diff --git a/internal/orchestrator/storage_adapter.go b/internal/orchestrator/storage_adapter.go
index ab1dedf..6e473bf 100644
--- a/internal/orchestrator/storage_adapter.go
+++ b/internal/orchestrator/storage_adapter.go
@@ -184,8 +184,8 @@ func (s *StorageAdapter) logCurrentBackupCount() {
 		return
 	}

-	// Il backend cloud logga già il conteggio corrente durante ApplyRetention
-	// riutilizzando la stessa lista; evitiamo una seconda chiamata rclone lsl.
+	// The cloud backend already logs the current count during ApplyRetention
+	// by reusing the same list; avoid a second rclone lsl call.
 	if s.backend.Location() == storage.LocationCloud {
 		return
 	}
diff --git a/internal/storage/cloud.go b/internal/storage/cloud.go
index 1b5036e..e7cd189 100644
--- a/internal/storage/cloud.go
+++ b/internal/storage/cloud.go
@@ -1271,6 +1271,10 @@ func (c *CloudStorage) deleteAssociatedLog(ctx context.Context, backupFile strin
 	if base == "" {
 		return false
 	}
+	baseForWarning := c.cloudLogBase(base)
+	if baseForWarning == "" {
+		baseForWarning = base
+	}

 	host, ts, ok := extractLogKeyFromBackup(backupFile)
 	if !ok {
@@ -1296,7 +1300,7 @@ func (c *CloudStorage) deleteAssociatedLog(ctx context.Context, backupFile strin
 		msg := strings.TrimSpace(string(output))
 		if isRcloneObjectNotFound(msg) || isRcloneObjectNotFound(err.Error()) {
 			if strings.Contains(strings.ToLower(msg), "directory") {
-				c.markCloudLogPathMissing(base, msg)
+				c.markCloudLogPathMissing(baseForWarning, msg)
 			} else {
 				c.logger.Debug("Cloud logs: log already removed %s (%s)", cloudPath, msg)
 			}
@@ -1317,7 +1321,7 @@ func (c *CloudStorage) countLogFiles(ctx context.Context) int {
 	if c == nil || c.config == nil {
 		return -1
 	}
-	base := strings.TrimSpace(c.config.CloudLogPath)
+	base := c.cloudLogBase(c.config.CloudLogPath)
 	if base == "" {
 		return 0
 	}
@@ -1357,6 +1361,18 @@ func (c *CloudStorage) countLogFiles(ctx context.Context) int {
 	return count
 }

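+// cloudLogBase qualifies a configured log path with the rclone remote when the
+// path carries no "remote:" prefix, and strips any trailing slash.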
+func (c *CloudStorage) cloudLogBase(basePath string) string {
+	base := strings.TrimSpace(basePath)
+	if base == "" {
+		return ""
+	}
+	if !strings.Contains(base, ":") && c != nil && strings.TrimSpace(c.remote) != "" {
+		base = strings.TrimSpace(c.remote) + ":" + base
+	}
+	base = strings.TrimRight(base, "/")
+	return base
+}
+
 // cloudLogPath builds the full cloud log path, mirroring buildCloudLogDestination logic.
 // Supports both new style (/path) and legacy style (remote:/path).
 // If basePath doesn't contain ":", uses c.remote as the remote name.
diff --git a/internal/storage/cloud_test.go b/internal/storage/cloud_test.go
index 2a394f1..8e091c4 100644
--- a/internal/storage/cloud_test.go
+++ b/internal/storage/cloud_test.go
@@ -575,6 +575,67 @@ func TestCloudStorageSkipsCloudLogsWhenPathMissing(t *testing.T) {
 	}
 }

+func TestCloudStorageCountLogFiles_NormalizesPathOnlyCloudLogPath(t *testing.T) {
+	cfg := &config.Config{
+		CloudEnabled:  true,
+		CloudRemote:   "remote:base-prefix",
+		CloudLogPath:  "/logs/",
+		RcloneRetries: 1,
+	}
+	cs := newCloudStorageForTest(cfg)
+	queue := &commandQueue{
+		t: t,
+		queue: []queuedResponse{
+			{
+				name: "rclone",
+				args: []string{"lsf", "remote:/logs", "--files-only"},
+				out:  "backup-host-20250101-010101.log\n",
+			},
+		},
+	}
+	cs.execCommand = queue.exec
+
+	if got := cs.countLogFiles(context.Background()); got != 1 {
+		t.Fatalf("countLogFiles() = %d; want 1", got)
+	}
+}
+
+func TestCloudStorageSkipsCloudLogsWhenPathOnlyLogDirMissing(t *testing.T) {
+	cfg := &config.Config{
+		CloudEnabled:  true,
+		CloudRemote:   "remote",
+		CloudLogPath:  "/logs",
+		RcloneRetries: 1,
+	}
+	cs := newCloudStorageForTest(cfg)
+	queue := &commandQueue{
+		t: t,
+		queue: []queuedResponse{
+			{
+				name: "rclone",
+				args: []string{"lsf", "remote:/logs", "--files-only"},
+				out:  "2025/11/16 22:11:47 ERROR : remote:/logs: directory not found",
+				err:  errors.New("exit status 3"),
+			},
+		},
+	}
+	cs.execCommand = queue.exec
+
+	if got := cs.countLogFiles(context.Background()); got != -1 {
+		t.Fatalf("countLogFiles() = %d; want -1", got)
+	}
+	if len(queue.calls) != 1 {
+		t.Fatalf("expected 1 rclone call, got %d", len(queue.calls))
+	}
+
+	if got := cs.countLogFiles(context.Background()); got != -1 {
+		t.Fatalf("countLogFiles() second call = %d; want -1", got)
+	}
+	if len(queue.calls) != 1 {
+		t.Fatalf("expected no additional rclone calls, got %d", len(queue.calls))
+	}
+}
+
 func TestCloudStorageRemoteHelpersAndBuildArgs(t *testing.T) {
 	cfg := &config.Config{
 		CloudEnabled: true,
diff --git a/internal/storage/retention_normalize.go b/internal/storage/retention_normalize.go
index 5583123..fa86cf3 100644
--- a/internal/storage/retention_normalize.go
+++ b/internal/storage/retention_normalize.go
@@ -4,10 +4,10 @@ import (
 	"github.com/tis24dev/proxsave/internal/logging"
 )

-// NormalizeGFSRetentionConfig applica le correzioni necessarie alla configurazione
-// GFS prima di eseguire la retention. Attualmente:
-//   - garantisce che il tier DAILY abbia almeno valore 1 (minimo accettato)
-//     se la policy è gfs e RETENTION_DAILY è <= 0.
+// NormalizeGFSRetentionConfig applies the required adjustments to the GFS configuration
+// before running retention. Currently:
+//   - ensures the DAILY tier is at least 1 (minimum accepted value)
+//     when the policy is gfs and RETENTION_DAILY is <= 0.
 //   - emits a log line to document the adjustment.
 func NormalizeGFSRetentionConfig(logger *logging.Logger, backendName string, cfg RetentionConfig) RetentionConfig {
 	if cfg.Policy != "gfs" {
diff --git a/internal/types/common.go b/internal/types/common.go
index e8e4b6c..97159b8 100644
--- a/internal/types/common.go
+++ b/internal/types/common.go
@@ -2,7 +2,7 @@ package types

 import "time"

-// ProxmoxType rappresenta il tipo di ambiente Proxmox
+// ProxmoxType represents the type of Proxmox environment.
 type ProxmoxType string

 const (
@@ -12,67 +12,67 @@ const (
 	// ProxmoxBS - Proxmox Backup Server
 	ProxmoxBS ProxmoxType = "pbs"

-	// ProxmoxUnknown - Tipo sconosciuto o non rilevato
+	// ProxmoxUnknown - Unknown or undetected type
 	ProxmoxUnknown ProxmoxType = "unknown"
 )

-// String restituisce la rappresentazione stringa del tipo Proxmox
+// String returns the string representation of the Proxmox type.
 func (p ProxmoxType) String() string {
 	return string(p)
 }

-// CompressionType rappresenta il tipo di compressione
+// CompressionType represents the compression type.
 type CompressionType string

 const (
-	// CompressionGzip - Compressione gzip
+	// CompressionGzip - gzip compression
 	CompressionGzip CompressionType = "gz"

-	// CompressionPigz - Compressione gzip parallela (pigz)
+	// CompressionPigz - parallel gzip compression (pigz)
 	CompressionPigz CompressionType = "pigz"

-	// CompressionBzip2 - Compressione bzip2
+	// CompressionBzip2 - bzip2 compression
 	CompressionBzip2 CompressionType = "bz2"

-	// CompressionXZ - Compressione xz (LZMA)
+	// CompressionXZ - xz compression (LZMA)
 	CompressionXZ CompressionType = "xz"

-	// CompressionLZMA - Compressione lzma classica
+	// CompressionLZMA - classic lzma compression
 	CompressionLZMA CompressionType = "lzma"

-	// CompressionZstd - Compressione zstd
+	// CompressionZstd - zstd compression
 	CompressionZstd CompressionType = "zst"

-	// CompressionNone - Nessuna compressione
+	// CompressionNone - no compression
 	CompressionNone CompressionType = "none"
 )

-// String restituisce la rappresentazione stringa del tipo di compressione
+// String returns the string representation of the compression type.
 func (c CompressionType) String() string {
 	return string(c)
 }

-// BackupInfo contiene informazioni su un backup
+// BackupInfo contains information about a backup.
 type BackupInfo struct {
-	// Timestamp del backup
+	// Backup timestamp
 	Timestamp time.Time

-	// Nome del file di backup
+	// Backup file name
 	Filename string

-	// Dimensione del file in bytes
+	// File size in bytes
 	Size int64

-	// Checksum SHA256
+	// SHA256 checksum
 	Checksum string

-	// Tipo di compressione usata
+	// Compression type used
 	Compression CompressionType

-	// Path completo del file
+	// Full file path
 	Path string

-	// Tipo di ambiente Proxmox
+	// Proxmox environment type
 	ProxmoxType ProxmoxType
 }

@@ -101,49 +101,49 @@ type BackupMetadata struct {
 	Version string
 }

-// StorageLocation rappresenta una destinazione di storage
+// StorageLocation represents a storage destination.
 type StorageLocation string

 const (
-	// StorageLocal - Storage locale
+	// StorageLocal - Local storage
 	StorageLocal StorageLocation = "local"

-	// StorageSecondary - Storage secondario
+	// StorageSecondary - Secondary storage
 	StorageSecondary StorageLocation = "secondary"

-	// StorageCloud - Storage cloud (rclone)
+	// StorageCloud - Cloud storage (rclone)
 	StorageCloud StorageLocation = "cloud"
 )

-// String restituisce la rappresentazione stringa della location
+// String returns the string representation of the location.
 func (s StorageLocation) String() string {
 	return string(s)
 }

-// LogLevel rappresenta il livello di logging
+// LogLevel represents the logging level.
 type LogLevel int

 const (
-	// LogLevelDebug - Log di debug (massimo dettaglio)
+	// LogLevelDebug - Debug logs (maximum detail)
 	LogLevelDebug LogLevel = 5

-	// LogLevelInfo - Informazioni generali
+	// LogLevelInfo - General information
 	LogLevelInfo LogLevel = 4

-	// LogLevelWarning - Avvisi
+	// LogLevelWarning - Warnings
 	LogLevelWarning LogLevel = 3

-	// LogLevelError - Errori
+	// LogLevelError - Errors
 	LogLevelError LogLevel = 2

-	// LogLevelCritical - Errori critici
+	// LogLevelCritical - Critical errors
 	LogLevelCritical LogLevel = 1

-	// LogLevelNone - Nessun log
+	// LogLevelNone - No logs
 	LogLevelNone LogLevel = 0
 )

-// String restituisce la rappresentazione stringa del log level
+// String returns the string representation of the log level.
 func (l LogLevel) String() string {
 	switch l {
 	case LogLevelDebug:
diff --git a/pkg/utils/files.go b/pkg/utils/files.go
index 27c8b50..40f1ed7 100644
--- a/pkg/utils/files.go
+++ b/pkg/utils/files.go
@@ -8,7 +8,7 @@ import (
 	"path/filepath"
 )

-// FileExists verifica se un file esiste
+// FileExists checks whether a file exists.
 func FileExists(path string) bool {
 	info, err := os.Stat(path)
 	if err != nil {
@@ -17,7 +17,7 @@ func FileExists(path string) bool {
 	return !info.IsDir()
 }

-// DirExists verifica se una directory esiste
+// DirExists checks whether a directory exists.
 func DirExists(path string) bool {
 	info, err := os.Stat(path)
 	if err != nil {
@@ -26,7 +26,7 @@ func DirExists(path string) bool {
 	return info.IsDir()
 }

-// EnsureDir crea una directory se non esiste
+// EnsureDir creates a directory if it doesn't exist.
 func EnsureDir(path string) error {
 	if DirExists(path) {
 		return nil
@@ -34,7 +34,7 @@
 	return os.MkdirAll(path, 0755)
 }

-// ComputeSHA256 calcola il checksum SHA256 di un file
+// ComputeSHA256 computes the SHA256 checksum of a file.
 func ComputeSHA256(filePath string) (string, error) {
 	file, err := os.Open(filePath)
 	if err != nil {
@@ -50,7 +50,7 @@ func ComputeSHA256(filePath string) (string, error) {
 	return fmt.Sprintf("%x", hash.Sum(nil)), nil
 }

-// GetFileSize restituisce la dimensione di un file in bytes
+// GetFileSize returns the file size in bytes.
 func GetFileSize(filePath string) (int64, error) {
 	info, err := os.Stat(filePath)
 	if err != nil {
@@ -59,7 +59,7 @@ func GetFileSize(filePath string) (int64, error) {
 	return info.Size(), nil
 }

-// AbsPath restituisce il path assoluto di un file
+// AbsPath returns the absolute path of a file.
 func AbsPath(path string) (string, error) {
 	absPath, err := filepath.Abs(path)
 	if err != nil {
@@ -68,7 +68,7 @@ func AbsPath(path string) (string, error) {
 	return absPath, nil
 }

-// IsAbsPath verifica se un path è assoluto
+// IsAbsPath checks whether a path is absolute.
 func IsAbsPath(path string) bool {
 	return filepath.IsAbs(path)
 }
diff --git a/pkg/utils/strings.go b/pkg/utils/strings.go
index 09509f9..93753c7 100644
--- a/pkg/utils/strings.go
+++ b/pkg/utils/strings.go
@@ -7,7 +7,7 @@ import (
 	"strings"
 )

-// FormatBytes converte bytes in formato human-readable (KB, MB, GB, ecc.)
+// FormatBytes converts bytes to a human-readable format (KB, MB, GB, etc.).
 func FormatBytes(bytes int64) string {
 	const unit = 1024
 	if bytes < unit {
@@ -21,13 +21,13 @@ func FormatBytes(bytes int64) string {
 	return fmt.Sprintf("%.1f %cB", float64(bytes)/float64(div), "KMGTPE"[exp])
 }

-// ParseBool converte una stringa in booleano (supporta vari formati)
+// ParseBool converts a string to a boolean (supports multiple formats).
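+// For example, ParseBool("Yes") and ParseBool("on") return true; ParseBool("0") returns false.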
 func ParseBool(s string) bool {
 	s = strings.ToLower(strings.TrimSpace(s))
 	return s == "true" || s == "1" || s == "yes" || s == "on" || s == "enabled"
 }

-// TrimQuotes rimuove le virgolette da una stringa
+// TrimQuotes removes surrounding quotes from a string.
 func TrimQuotes(s string) string {
 	s = strings.TrimSpace(s)
 	if len(s) >= 2 {
@@ -38,8 +38,8 @@ func TrimQuotes(s string) string {
 	return s
 }

-// SplitKeyValue divide una stringa "key=value" in key e value
-// Supporta anche commenti inline: KEY="value" # comment
+// SplitKeyValue splits a "key=value" string into key and value.
+// Supports inline comments too: KEY="value" # comment
 func SplitKeyValue(line string) (string, string, bool) {
 	parts := strings.SplitN(line, "=", 2)
 	if len(parts) != 2 {
@@ -70,7 +70,7 @@ func SplitKeyValue(line string) (string, string, bool) {
 	return key, value, true
 }

-// IsComment verifica se una linea è un commento (inizia con #)
+// IsComment checks whether a line is a comment (starts with #).
 func IsComment(line string) bool {
 	trimmed := strings.TrimSpace(line)
 	return strings.HasPrefix(trimmed, "#") || trimmed == ""
 }