Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions tests/e2e/backup_restore_cli_suite_test.go
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
package e2e_test

import (
"context"
"fmt"
"log"
"strings"
"time"

"github.com/onsi/ginkgo/v2"
"github.com/onsi/gomega"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

"github.com/openshift/oadp-operator/tests/e2e/lib"
)
Expand Down Expand Up @@ -173,6 +175,21 @@ func runApplicationBackupAndRestoreViaCLI(brCase ApplicationBackupRestoreCase, u
// run restore via CLI
runRestoreViaCLI(brCase.BackupRestoreCase, backupName, restoreName, nsRequiredResticDCWorkaround)

// For file-system backup restores (KOPIA/restic), the restored pods may have
// broken networking because OVN-Kubernetes doesn't fully wire the network
// namespace for pods recreated by Velero with a restore-wait init container.
// Deleting the pods lets the deployment controller create fresh ones with
// proper networking while preserving the restored PVC data.
Comment on lines +178 to +182
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this doc'd in velero?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

um... ur the maintainer :)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@kaovilai I can easily recreate the issue now, perhaps we need a new clean upstream velero bug and doc :)

if brCase.BackupRestoreType == lib.KOPIA {
log.Printf("Restarting pods in namespace %s to ensure proper networking after file-system restore", brCase.Namespace)
err = kubernetesClientForSuiteRun.CoreV1().Pods(brCase.Namespace).DeleteCollection(
context.Background(),
metav1.DeleteOptions{},
metav1.ListOptions{LabelSelector: "e2e-app=true"},
)
gomega.Expect(err).ToNot(gomega.HaveOccurred())
}

// Run optional custom verification
if brCase.PostRestoreVerify != nil {
log.Printf("Running post-restore custom function for case %s", brCase.Name)
Expand Down
15 changes: 15 additions & 0 deletions tests/e2e/backup_restore_suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,21 @@ func runApplicationBackupAndRestore(brCase ApplicationBackupRestoreCase, updateL
// run restore
runRestore(brCase.BackupRestoreCase, backupName, restoreName, nsRequiredResticDCWorkaround)

// For file-system backup restores (KOPIA/restic), the restored pods may have
// broken networking because OVN-Kubernetes doesn't fully wire the network
// namespace for pods recreated by Velero with a restore-wait init container.
// Deleting the pods lets the deployment controller create fresh ones with
// proper networking while preserving the restored PVC data.
if brCase.BackupRestoreType == lib.KOPIA {
log.Printf("Restarting pods in namespace %s to ensure proper networking after file-system restore", brCase.Namespace)
err = kubernetesClientForSuiteRun.CoreV1().Pods(brCase.Namespace).DeleteCollection(
context.Background(),
metav1.DeleteOptions{},
metav1.ListOptions{LabelSelector: "e2e-app=true"},
)
gomega.Expect(err).ToNot(gomega.HaveOccurred())
}

// Run optional custom verification
if brCase.PostRestoreVerify != nil {
log.Printf("Running post-restore custom function for case %s", brCase.Name)
Expand Down
128 changes: 119 additions & 9 deletions tests/e2e/lib/apps.go
Original file line number Diff line number Diff line change
Expand Up @@ -421,7 +421,9 @@ func RunMustGather(artifact_dir string, clusterClient client.Client) error {
}

// VerifyBackupRestoreData verifies if app ready before backup and after restore to compare data.
func VerifyBackupRestoreData(ocClient client.Client, kubeClient *kubernetes.Clientset, kubeConfig *rest.Config, artifactDir string, namespace string, routeName string, serviceName string, app string, prebackupState bool, twoVol bool) error {
// skipReadyz skips the post-restore readyz endpoint check (use for VM-based tests where the
// app route is not directly reachable from the test harness).
func VerifyBackupRestoreData(ocClient client.Client, kubeClient *kubernetes.Clientset, kubeConfig *rest.Config, artifactDir string, namespace string, routeName string, serviceName string, app string, prebackupState bool, twoVol bool, skipReadyz ...bool) error {
log.Printf("Verifying backup/restore data of %s", app)
appEndpointURL, proxyPodParams, err := getAppEndpointURLAndProxyParams(ocClient, kubeClient, kubeConfig, namespace, serviceName, routeName)
log.Printf("App endpoint URL: %s", appEndpointURL)
Expand Down Expand Up @@ -490,19 +492,107 @@ func VerifyBackupRestoreData(ocClient client.Client, kubeClient *kubernetes.Clie
return err
}
} else {
//restore check
// --- Restore verification ---
// After a Velero restore, verify that the application is serving the same data
// that was captured before the backup (stored in backup-data.txt).
//
// Flow for todo apps (mysql-persistent / mongo-persistent):
// 1. If !shouldSkipReadyz: poll /healthz to confirm the app is alive.
// 2. Fetch /todo-incomplete to get the current data for comparison.
// - For VM-based tests (shouldSkipReadyz=true) the route is not directly
// reachable until the VM finishes cloud-init, so we poll with retries.
// - For container-based tests a single request suffices after healthz passes.
// Flow for parks-app: single GET /clicks.
// Finally, compare the fetched data against backup-data.txt.

shouldSkipReadyz := len(skipReadyz) > 0 && skipReadyz[0]
isTodoApp := namespace == "mysql-persistent" || namespace == "mongo-persistent"

// Step 1: healthz gate (container-based todo apps only).
// Polls /healthz to confirm the app is alive and the HTTP server is responding.
// The todo2-go app exposes /healthz (used by all K8s probes) and /readyz (returns
// 503 until DB is connected). We use /healthz here because it matches the probe
// configuration in the app manifests and becomes available immediately on startup.
// Skipped for VM tests where the app runs inside a Fedora/CentOS VM and the
// OpenShift route proxies to a different service topology.
if isTodoApp && !shouldSkipReadyz {
// MakeRequest can return err == nil for HTTP 5xx when using the proxy (curl),
// so we validate the response body and errResp via isHealthzAlive.
requestParams := getRequestParameters(appEndpointURL+"/healthz", proxyPodParams, GET, nil)
const maxHealthzAttempts = 5
for attempt := 1; attempt <= maxHealthzAttempts; attempt++ {
log.Printf("healthz check attempt %d/%d: GET %s/healthz\n", attempt, maxHealthzAttempts, appEndpointURL)
respData, errResp, err = MakeRequest(*requestParams)
if err == nil && isHealthzAlive(respData, errResp) {
log.Printf("healthz endpoint is alive (attempt %d/%d): %s\n", attempt, maxHealthzAttempts, respData)
break
}
if err != nil {
if errResp != "" {
log.Printf("Request response error msg: %s\n", errResp)
}
} else {
log.Printf("healthz attempt %d/%d: response not healthy (body=%q, errResp=%q)\n", attempt, maxHealthzAttempts, respData, errResp)
}
if attempt == maxHealthzAttempts {
log.Printf("healthz endpoint did not become alive after %d attempts: %v\n", maxHealthzAttempts, err)
if err != nil {
return err
}
return fmt.Errorf("healthz did not return healthy response after %d attempts (last body=%q, errResp=%q)", maxHealthzAttempts, respData, errResp)
}
backoff := time.Duration(attempt) * 5 * time.Second
log.Printf("healthz attempt %d/%d failed, retrying in %s: %v\n", attempt, maxHealthzAttempts, backoff, err)
time.Sleep(backoff)
}
}

if namespace == "mysql-persistent" || namespace == "mongo-persistent" {
// Make request to the "todo-incomplete" endpoint
// Step 2: fetch /todo-incomplete data for todo apps.
// In the VM (shouldSkipReadyz) case we skipped the readyz gate above, so the
// app may not be ready yet. Poll with retries and increasing backoff.
// In the container case healthz already passed, so one attempt is enough.
if isTodoApp {
requestParamsTodoIncomplete := getRequestParameters(appEndpointURL+"/todo-incomplete", proxyPodParams, GET, nil)
respData, errResp, err = MakeRequest(*requestParamsTodoIncomplete)
if err != nil {
if errResp != "" {
log.Printf("Request response error msg: %s\n", errResp)
maxTodoAttempts := 1
todoBackoffSec := 0
if shouldSkipReadyz {
maxTodoAttempts = 10
todoBackoffSec = 10
}
for attempt := 1; attempt <= maxTodoAttempts; attempt++ {
if maxTodoAttempts > 1 {
log.Printf("Polling app endpoint attempt %d/%d: GET %s/todo-incomplete", attempt, maxTodoAttempts, appEndpointURL)
}
return err
respData, errResp, err = MakeRequest(*requestParamsTodoIncomplete)
success := err == nil && (maxTodoAttempts == 1 || len(bytes.TrimSpace([]byte(respData))) > 0)
if success {
if maxTodoAttempts > 1 {
log.Printf("VIRT App endpoint responded with data (attempt %d/%d): %s", attempt, maxTodoAttempts, respData)
}
break
}
if attempt == maxTodoAttempts {
if err != nil {
if errResp != "" {
log.Printf("Request response error msg: %s\n", errResp)
}
return err
}
if maxTodoAttempts > 1 {
log.Printf("VIRT App endpoint returned empty data after %d attempts", maxTodoAttempts)
return errors.New("VIRT App endpoint returned empty data after max attempts")
}
if errResp != "" {
log.Printf("Request response error msg: %s\n", errResp)
}
return err
}
backoff := time.Duration(attempt) * time.Duration(todoBackoffSec) * time.Second
log.Printf("VIRT Attempt %d/%d: no data yet, retrying in %s (err=%v, resp=%q)", attempt, maxTodoAttempts, backoff, err, respData)
time.Sleep(backoff)
}
}

if namespace == "parks-app" {
// Make request to the "clicks" endpoint
responseParams := getRequestParameters(appEndpointURL+"/clicks", proxyPodParams, GET, nil)
Expand Down Expand Up @@ -539,6 +629,26 @@ func VerifyBackupRestoreData(ocClient client.Client, kubeClient *kubernetes.Clie
return nil
}

// errRespIndicatesHTTPError reports whether errResp carries HTTP error
// indicators (for example a 5xx status surfaced by MakeRequest through the
// proxy-pod curl path). An empty errResp never indicates an error.
func errRespIndicatesHTTPError(errResp string) bool {
	if errResp == "" {
		return false
	}
	for _, marker := range []string{"HTTP request failed", "status code", "500", "502", "503"} {
		if strings.Contains(errResp, marker) {
			return true
		}
	}
	return false
}

// isHealthzAlive reports whether a /healthz response indicates the app is
// alive. The check requires a non-blank response body (any content counts)
// and an errResp free of HTTP error indicators.
func isHealthzAlive(respData, errResp string) bool {
	if strings.TrimSpace(respData) == "" {
		return false
	}
	return !errRespIndicatesHTTPError(errResp)
}

func getRequestParameters(url string, proxyPodParams *ProxyPodParameters, method HTTPMethod, payload *string) *RequestParameters {
return &RequestParameters{
ProxyPodParams: proxyPodParams,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,12 @@ items:
supplementalGroups:
type: RunAsAny
volumes:
- '*'
- persistentVolumeClaim
- secret
- configMap
- downwardAPI
- projected
- emptyDir
users:
- system:admin
- system:serviceaccount:mongo-persistent:mongo-persistent-sa
Expand Down Expand Up @@ -78,18 +83,21 @@ items:
securityContext:
runAsUser: 0
# Format the block device on first use so MongoDB can use the filesystem
# Use same app image; install e2fsprogs then format (mongo:7 base has apt)
initContainers:
- image: quay.io/migtools/oadp-ci-todolist-mongo-go-testing:latest
- image: quay.io/migtools/oadp-ci-todo2-go-testing-mongodb:latest
imagePullPolicy: IfNotPresent
securityContext:
privileged: true
name: setup-block-device
command:
- "sh"
- "bash"
- "-c"
- |
DEVICE="/dev/xvdx"
MOUNT_POINT="/data/db"
set -e
apt-get update -qq && apt-get install -y -qq e2fsprogs
DEVICE="/dev/block-pv"
MOUNT_POINT="/var/lib/mongodb"
if [ ! -e $DEVICE ]; then
echo "$DEVICE does not exist."
exit 1
Expand All @@ -108,38 +116,36 @@ items:
umount $MOUNT_POINT
volumeDevices:
- name: block-volume-pv
devicePath: /dev/xvdx
devicePath: /dev/block-pv
containers:
- name: todolist
image: quay.io/migtools/oadp-ci-todolist-mongo-go-testing:latest
image: quay.io/migtools/oadp-ci-todo2-go-testing-mongodb:latest
securityContext:
privileged: true
env:
- name: MONGO_INITDB_ROOT_USERNAME
value: changeme
- name: MONGO_INITDB_ROOT_PASSWORD
value: changeme
- name: MONGO_INITDB_DATABASE
- name: DB_BACKEND
value: mongodb
- name: MONGO_DATABASE
value: todolist
ports:
- containerPort: 8000
protocol: TCP
resources:
limits:
memory: 512Mi
# Block mode: mount the block device and use it for /data/db
# Block mode: mount the block device and use it for /var/lib/mongodb
command:
- "sh"
- "-c"
- |
DEVICE="/dev/xvdx"
MOUNT_POINT="/data/db"
DEVICE="/dev/block-pv"
MOUNT_POINT="/var/lib/mongodb"
mkdir -p $MOUNT_POINT
mount $DEVICE $MOUNT_POINT
exec /opt/todolist/entrypoint.sh
volumeDevices:
- name: block-volume-pv
devicePath: /dev/xvdx
devicePath: /dev/block-pv
startupProbe:
httpGet:
path: /healthz
Expand All @@ -157,7 +163,7 @@ items:
periodSeconds: 10
readinessProbe:
httpGet:
path: /healthz
path: /readyz
port: 8000
initialDelaySeconds: 10
periodSeconds: 5
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ items:
serviceAccountName: mongo-persistent-sa
containers:
- name: todolist
image: quay.io/migtools/oadp-ci-todolist-mongo-go-testing:latest
image: quay.io/migtools/oadp-ci-todo2-go-testing-mongodb:latest
securityContext:
privileged: false
allowPrivilegeEscalation: false
Expand All @@ -87,11 +87,9 @@ items:
seccompProfile:
type: RuntimeDefault
env:
- name: MONGO_INITDB_ROOT_USERNAME
value: changeme
- name: MONGO_INITDB_ROOT_PASSWORD
value: changeme
- name: MONGO_INITDB_DATABASE
- name: DB_BACKEND
value: mongodb
- name: MONGO_DATABASE
value: todolist
ports:
- containerPort: 8000
Expand All @@ -101,7 +99,7 @@ items:
memory: 512Mi
volumeMounts:
- name: mongo-data
mountPath: /data/db
mountPath: /var/lib/mongodb
startupProbe:
httpGet:
path: /healthz
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,13 +90,11 @@ items:
serviceAccountName: mongo-persistent-sa
containers:
- name: todolist
image: quay.io/migtools/oadp-ci-todolist-mongo-go-testing:latest
image: quay.io/migtools/oadp-ci-todo2-go-testing-mongodb:latest
env:
- name: MONGO_INITDB_ROOT_USERNAME
value: changeme
- name: MONGO_INITDB_ROOT_PASSWORD
value: changeme
- name: MONGO_INITDB_DATABASE
- name: DB_BACKEND
value: mongodb
- name: MONGO_DATABASE
value: todolist
ports:
- containerPort: 8000
Expand All @@ -106,7 +104,7 @@ items:
memory: 512Mi
volumeMounts:
- name: mongo-data
mountPath: /data/db
mountPath: /var/lib/mongodb
startupProbe:
httpGet:
path: /healthz
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,12 +81,14 @@ items:
serviceAccountName: mysql-persistent-sa
containers:
- name: todolist
image: quay.io/migtools/oadp-ci-todolist-mariadb-go-testing:testing
image: quay.io/migtools/oadp-ci-todo2-go-testing-mariadb:latest
securityContext:
runAsGroup: 27
runAsUser: 27
privileged: true
env:
- name: DB_BACKEND
value: mariadb
- name: MYSQL_USER
value: changeme
- name: MYSQL_PASSWORD
Expand Down
Loading