Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 15 additions & 10 deletions .github/actions/run-tests/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,26 @@ name: Run Restate Jepsen tests
description: "Runs Restate Jepsen test suite. Assumes you have already called the `setup` action."
inputs:
restateImageId:
description: 'Restate image (ignored if PR set)'
description: "Restate image (ignored if PR set)"
required: false
default: 'ghcr.io/restatedev/restate:main'
default: "ghcr.io/restatedev/restate:main"
restatePr:
description: 'Use CI Docker image from PR (ignored if commit is set)'
description: "Use CI Docker image from PR (ignored if commit is set)"
required: false
restateCommit:
description: 'Use CI Docker image from Restate commit'
description: "Use CI Docker image from Restate commit"
required: false
clusterName:
description: 'Jepsen workers cluster AWS stack name'
description: "Jepsen workers cluster AWS stack name"
required: true
testConfig:
description: 'Jepsen test run configuration'
description: "Jepsen test run configuration"
required: false
default: '{ "workloads": "set-vo", "nemeses": "partition-random-node", "duration": "60", "rate": "100", "concurrency": "5n", "testCount": "50" }'
default: '{ "workloads": "set-vo", "nemeses": "partition-random-node", "duration": "60", "rate": "100", "concurrency": "5n", "testCount": "50" }'
retainCluster:
description: "Retain AWS cluster after Jepsen run"
required: false
default: "false"
workDir:
description: "Test suite checkout location/Jepsen working directory"
required: false
Expand Down Expand Up @@ -58,11 +62,11 @@ runs:
echo "Checking for node connectivity..."
CONNECTIVITY_MAX_RETRIES=5
CONNECTIVITY_RETRY_DELAY=15

while IFS= read -r node || [[ -n "$node" ]]; do
if [[ -n "$node" && ! "$node" =~ ^[[:space:]]*# ]]; then
echo -n " Checking $node... "

CONNECTIVITY_SUCCESS=false
for attempt in $(seq 1 $CONNECTIVITY_MAX_RETRIES); do
if timeout 120 bash -c "</dev/tcp/$node/22" 2>/dev/null; then
Expand All @@ -78,7 +82,7 @@ runs:
fi
fi
done

if [ "$CONNECTIVITY_SUCCESS" = false ]; then
echo "::error::Cannot connect to SSH port on $node after $CONNECTIVITY_MAX_RETRIES attempts"
exit 1
Expand Down Expand Up @@ -119,6 +123,7 @@ runs:
--nodes-file aws/nodes.txt \
--username admin \
--ssh-private-key aws/private-key.pem \
--dynamodb-table "$(jq -r 'keys[0] as $stack_name | .[$stack_name].DynamoDbMetadataTableName' aws/cdk-outputs.json)" \
--snapshot-bucket "$(jq -r 'keys[0] as $stack_name | .[$stack_name].BucketName' aws/cdk-outputs.json)" \
--metadata-bucket "$(jq -r 'keys[0] as $stack_name | .[$stack_name].BucketName' aws/cdk-outputs.json)" \
--workload ${WORKLOAD} --nemesis ${NEMESIS} \
Expand Down
5 changes: 4 additions & 1 deletion .github/actions/setup/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ inputs:
bucketName:
description: "Storage bucket for cluster - will be created on-demand if unset"
required: false
tableName:
description: "DynamoDB table name for metadata - will be created on-demand if unset"
required: false
workDir:
description: "Test suite checkout location/Jepsen working directory"
required: false
Expand Down Expand Up @@ -82,4 +85,4 @@ runs:
working-directory: ./${{ inputs.workDir }}
run: |
SOURCE_IP=$(curl -s https://checkip.amazonaws.com)
just create-aws-cluster "${{ inputs.clusterName }}" "${SOURCE_IP}/32" "${{ inputs.bucketName }}"
just create-aws-cluster "${{ inputs.clusterName }}" "${SOURCE_IP}/32" "${{ inputs.bucketName }}" "${{ inputs.tableName }}"
5 changes: 4 additions & 1 deletion .github/actions/teardown/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ inputs:
bucketName:
description: "Storage bucket for cluster - will be created on-demand if unset"
required: false
tableName:
description: "DynamoDB table name for metadata - will be created on-demand if unset"
required: false

runs:
using: "composite"
Expand All @@ -15,4 +18,4 @@ runs:
- name: Destroy cluster
shell: bash
working-directory: ./jepsen
run: just destroy-aws-cluster "${{ inputs.clusterName }}" "${{ inputs.bucketName }}"
run: just destroy-aws-cluster "${{ inputs.clusterName }}" "${{ inputs.bucketName }}" "${{ inputs.tableName }}"
102 changes: 51 additions & 51 deletions .github/workflows/jepsen.yml
Original file line number Diff line number Diff line change
@@ -1,37 +1,44 @@
name: Run Restate Jepsen tests
on:
push:
branches: [ "main" ]
branches: ["main"]
pull_request:
schedule:
- cron: '0 0 * * *'
- cron: "0 0 * * *"
workflow_dispatch:
inputs:
ref:
description: "Git ref for test suite"
required: false
default: 'main'
default: "main"
restateCommit:
description: 'Git commit id (uses restatedev/restate Docker artifact)'
description: "Git commit id (uses restatedev/restate Docker artifact)"
required: false
default: ''
default: ""
type: string
restatePr:
description: 'Test against PR Docker artifact (commit takes precedence)'
description: "Test against PR Docker artifact (commit takes precedence)"
required: false
default: ''
default: ""
type: string
restateImageId:
description: 'Docker image (PR#/commit take precedence; defaults to ghcr.io/restatedev/restate:main)'
description: "Docker image (PR#/commit take precedence; defaults to ghcr.io/restatedev/restate:main)"
required: false
default: 'ghcr.io/restatedev/restate:main'
default: "ghcr.io/restatedev/restate:main"
type: string
testConfig:
description: 'Test configuration parameters'
description: "Test configuration parameters"
required: false
default: '{"workloads": "set-vo set-mds set-mds-s3", "nemeses": "partition-random-node", "duration": "60", "rate": "100", "concurrency": "5n", "testCount": "20"}'
# Default test matrix for manually dispatched runs. Includes set-mds-ddb
# (presumably the DynamoDB-backed metadata workload - confirm) alongside the
# existing workloads; the temporary single-workload, testCount=1 debug default
# has been reverted as its TODO requested.
default: '{"workloads": "set-vo set-mds set-mds-s3 set-mds-ddb", "nemeses": "partition-random-node", "duration": "60", "rate": "100", "concurrency": "5n", "testCount": "20"}'
clusterName:
description: "Jepsen workers cluster AWS stack name"
required: false
type: string
retainCluster:
description: 'Retain the AWS worker node cluster after test'
description: "Retain AWS cluster after Jepsen run"
default: true
required: false
type: boolean

Expand All @@ -45,47 +52,40 @@ jobs:
if: github.event.repository.fork == false
runs-on: warp-ubuntu-latest-x64-4x
env:
CLUSTER_NAME: restatedev-jepsen-${{ github.event.pull_request.number != null && format('pr{0}', github.event.pull_request.number) || format('run{0}', github.run_id) }}
CLUSTER_NAME: ${{ inputs.clusterName || format('restatedev-jepsen-{0}', github.event.pull_request.number != null && format('pr-{0}', github.event.pull_request.number) || format('run-{0}', github.run_id)) }}

steps:
- uses: actions/checkout@v4
with:
repository: restatedev/jepsen
sparse-checkout: |
.github
- uses: actions/checkout@v4
with:
repository: restatedev/jepsen
sparse-checkout: |
.github

- uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: us-east-1 # co-locate with WarpBuild worker
role-to-assume: 'arn:aws:iam::307946634685:role/github-restatedev-jepsen-actions-role'
- name: Setup Jepsen cluster ${{ env.CLUSTER_NAME }}
uses: ./.github/actions/setup
with:
ref: ${{ inputs.ref }}
clusterName: ${{ env.CLUSTER_NAME }}
bucketName: "restate-jepsen-tests-us-east-1"
- name: Drop AWS credentials
run: |
echo "AWS_ACCESS_KEY_ID=" >> $GITHUB_ENV
echo "AWS_SECRET_ACCESS_KEY=" >> $GITHUB_ENV
echo "AWS_SESSION_TOKEN=" >> $GITHUB_ENV
- uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: us-east-1 # co-locate with WarpBuild worker
role-to-assume: "arn:aws:iam::307946634685:role/github-restatedev-jepsen-actions-role"
- name: Setup Jepsen cluster ${{ env.CLUSTER_NAME }}
uses: ./.github/actions/setup
with:
ref: ${{ inputs.ref }}
clusterName: ${{ env.CLUSTER_NAME }}
bucketName: "restate-jepsen-tests-us-east-1"
tableName: "restate-jepsen-tests"

- name: Run Jepsen tests
uses: ./.github/actions/run-tests
with:
restateImageId: ${{ inputs.restateImageId }}
restatePr: ${{ inputs.restatePr }}
restateCommit: ${{ inputs.restateCommit }}
testConfig: ${{ inputs.testConfig || '{"workloads":"set-vo set-mds set-mds-s3","nemeses":"partition-random-node","duration":"60","rate":"100","concurrency":"5n","testCount":"20"}' }}
# Runs the Jepsen suite through the local composite action, forwarding the
# workflow inputs that select the Restate image and the test configuration.
- name: Run Jepsen tests
  uses: ./.github/actions/run-tests
  with:
    restateImageId: ${{ inputs.restateImageId }}
    restatePr: ${{ inputs.restatePr }}
    restateCommit: ${{ inputs.restateCommit }}
    # Use the built-in workload mix when no testConfig input was supplied.
    testConfig: ${{ inputs.testConfig || '{"workloads":"set-vo set-mds set-mds-s3","nemeses":"partition-random-node","duration":"60","rate":"100","concurrency":"5n","testCount":"20"}' }}
    # Pass a bare false when the input is unset or false. The previous fallback
    # '"false"' embedded literal double quotes in the value handed to the action.
    retainCluster: ${{ inputs.retainCluster || 'false' }}

- uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: us-east-1
role-to-assume: "arn:aws:iam::307946634685:role/github-restatedev-jepsen-actions-role"
if: always()
- name: Tear down Jepsen cluster ${{ env.CLUSTER_NAME }}
uses: ./.github/actions/teardown
if: always()
with:
clusterName: ${{ env.CLUSTER_NAME }}
bucketName: "restate-jepsen-tests-us-east-1"
- name: Tear down Jepsen cluster ${{ env.CLUSTER_NAME }}
uses: ./.github/actions/teardown
if: ${{ always() && !(inputs.retainCluster || false) }}
with:
clusterName: ${{ env.CLUSTER_NAME }}
bucketName: "restate-jepsen-tests-us-east-1"
tableName: "restate-jepsen-tests"
21 changes: 19 additions & 2 deletions aws/cluster.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
*/

import * as cdk from "aws-cdk-lib";
import { aws_ec2 as ec2, aws_iam as iam, aws_s3 as s3 } from "aws-cdk-lib";
import { aws_ec2 as ec2, aws_iam as iam, aws_s3 as s3, aws_dynamodb as ddb } from "aws-cdk-lib";

const app = new cdk.App();

Expand All @@ -22,6 +22,7 @@ const instanceType = ec2.InstanceType.of(ec2.InstanceClass.T3, ec2.InstanceSize.
// if you have existing buckets, pass their names into the stack and the workers will be granted access;
// if unset, unique buckets will be created as part of deploying the stack
const bucketName = app.node.tryGetContext("bucket-name");
const tableName = app.node.tryGetContext("table-name");

// --- no configuration past this point ---

Expand Down Expand Up @@ -59,7 +60,23 @@ if (bucketName) {
});
}
bucket.grantReadWrite(instanceRole);
new cdk.CfnOutput(stack, `BucketName`, { value: bucket.bucketName });
new cdk.CfnOutput(stack, "BucketName", { value: bucket.bucketName });

let table: ddb.ITable;
if (tableName) {
table = ddb.Table.fromTableName(stack, "DynamoDbMetadataTable", tableName);
} else {
table = new ddb.Table(stack, "DynamoDbMetadataTable", {
partitionKey: {
name: "kind",

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have changed the partition key name to `pk` instead of `kind` (`key` is a reserved word in DynamoDB and did not work).

Suggested change
name: "kind",
name: "pk",

type: ddb.AttributeType.STRING,
},
billingMode: ddb.BillingMode.PAY_PER_REQUEST,
removalPolicy: cdk.RemovalPolicy.DESTROY,
});
}
table.grantReadWriteData(instanceRole);
new cdk.CfnOutput(stack, "DynamoDbMetadataTableName", { value: table.tableName });

function addNodeInstance(n: number) {
const cloudConfig = ec2.UserData.custom([`cloud_final_modules:`, `- [scripts-user, once]`].join("\n"));
Expand Down
16 changes: 12 additions & 4 deletions justfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,26 +5,34 @@ make-services:
npm clean-install
npm run bundle

create-aws-cluster stack-name="" allow-source-cidr="0.0.0.0/0" bucket-name="":
create-aws-cluster stack-name="" allow-source-cidr="0.0.0.0/0" bucket-name="" table-name="":
#!/usr/bin/env bash
set -e
cd aws
npm clean-install
npm run deploy -- --context stack-name={{stack-name}} --context allow-source-cidr={{allow-source-cidr}} --context bucket-name={{bucket-name}}
npm run deploy -- \
--context stack-name={{stack-name}} \
--context allow-source-cidr={{allow-source-cidr}} \
--context bucket-name={{bucket-name}} \
--context table-name={{table-name}}
bash get-node-info.sh

destroy-aws-cluster stack-name="" bucket-name="":
destroy-aws-cluster stack-name="" bucket-name="" table-name="":
#!/usr/bin/env bash
set -e
cd aws
npm run destroy -- --context stack-name={{stack-name}} --context bucket-name={{bucket-name}}
# Everything after the bare `--` is forwarded by npm to the destroy script.
# Keep a space before each trailing backslash so the line continuation can
# never glue `--` onto the first --context argument.
npm run destroy -- \
  --context stack-name={{stack-name}} \
  --context bucket-name={{bucket-name}} \
  --context table-name={{table-name}}

run-test workload="set-vo" nemesis="partition-random-node" image="ghcr.io/restatedev/restate:main":
#!/usr/bin/env bash
set -e
# NB: we should use unique prefixes for each test run so that we don't have to wipe the bucket contents
lein run test --nodes-file aws/nodes.txt --username admin --ssh-private-key aws/private-key.pem \
--image {{image}} \
--dynamodb-table "$(jq -r 'keys[0] as $stack_name | .[$stack_name].DynamoDbMetadataTableName' aws/cdk-outputs.json)" \
--metadata-bucket "$(jq -r 'keys[0] as $stack_name | .[$stack_name].BucketName' aws/cdk-outputs.json)" \
--snapshot-bucket "$(jq -r 'keys[0] as $stack_name | .[$stack_name].BucketName' aws/cdk-outputs.json)" \
--leave-db-running true \
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
roles = ["admin", "worker", "log-server"]
roles = ["admin", "worker", "log-server", "http-ingress"]
tracing-filter = "info"
log-filter = "restate=info,slog=info,info"
log-format = "compact"
Expand All @@ -13,8 +13,7 @@ log-trim-check-interval = "5s"
log-tail-update-interval = "3s"

[metadata-client]
type = "object-store"
path = "s3://<metadata-bucket>" # set via environment
# type / path / table-name are set by the workload via environment overrides

[bifrost]
default-provider = "replicated"
Expand Down
2 changes: 1 addition & 1 deletion resources/restate-server.toml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
roles = ["admin", "metadata-server", "worker", "log-server"]
# Roles enabled for this server configuration.
roles = ["admin", "metadata-server", "worker", "log-server", "http-ingress"]
tracing-filter = "info"
log-filter = "restate=info,slog=info,info"
log-format = "compact"
Expand Down
Loading
Loading