Skip to content

Commit c009459

Browse files
committed
roachprod: pre-bake custom roachprod cloud images
Prior to this patch, roachprod clusters were created from bare Ubuntu images. This was inadequate for multiple reasons, including: - dependency on the availability of third parties (GCS, APT repositories) - spinning up two clusters at different moments in time could lead to different resulting systems (package versions, ...) and create reproducibility issues - the growing number of installed dependencies increases the boot time To address this, this patch creates a new roachprod bake-images command that relies on HashiCorp Packer to pre-bake ready-to-use cloud images for AWS and GCP. This creates a system dependency on Packer and requires the machine that runs the command to have Packer installed and to be authenticated on AWS and GCP with authorization to create instances and publish new images. If an image already exists, it won't get built again, making re-running roachprod bake-images safe. The pre-baking process creates images for amd64, arm64 and fips, and pushes them to the roachprod-compatible regions (only for AWS, since images are globally available in GCP). The images are tagged with a hashed checksum of the startup script, which defines their unique version. At runtime, the provider checksums the startup script to figure out which pre-baked image should be used, and checks for its availability in the cloud provider for that specific region/zone: - if the image exists, it is used to create the instance, and only a subset (runtime) of the startup scripts is executed on the instances, decreasing the startup time to a minimum (5s or so for disk setup) - if the image does not exist, the system falls back to using the base image and the whole set of startup scripts (pre-baking + runtime) is executed on the instances This patch also drops the JSON hardcoded AMI IDs (or names in GCP) and introduces auto-discovery of the base image's most recent version based on the image name/family and owner or project ID. 
This allows us to automatically keep up to date with the latest patch releases, which usually are security updates. Notes: - this patch only contains the implementation for AWS and GCP; Azure and IBM should also be implemented - a CI mechanism should be built to automatically build all images when there is a change in the startup scripts (either GitHub upon merge to master or TeamCity nightly runs) - there is currently no built-in way to deprecate/clean up previous images since they might still be used on older branches; a cleanup routine should be considered if/when the number of images gets out of hand Beyond this first iteration, a concept of "pre-bake only snippets" should come next: snippets that are only executed at pre-baking time and not at runtime, even if there is no pre-baked image. These snippets would contain ad hoc roachtest setups (building/pre-installing third-party tools like Prometheus/Grafana, Jepsen, Kafka CLI, ...), which would remove the need for these tests to build/install third-party dependencies at runtime if the test is running on an instance supported by a pre-baked image (see #62066 as an example). Epic: none Informs: #150144 Release note: None
1 parent b86f9f7 commit c009459

File tree

26 files changed

+1983
-427
lines changed

26 files changed

+1983
-427
lines changed

pkg/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1743,6 +1743,7 @@ GO_TARGETS = [
17431743
"//pkg/roachprod/vm/ibm:ibm_test",
17441744
"//pkg/roachprod/vm/local:local",
17451745
"//pkg/roachprod/vm/local:local_test",
1746+
"//pkg/roachprod/vm/utils/packer:packer",
17461747
"//pkg/roachprod/vm:vm",
17471748
"//pkg/roachprod/vm:vm_test",
17481749
"//pkg/roachprod:roachprod",

pkg/cmd/roachprod/cli/BUILD.bazel

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,14 +27,17 @@ go_library(
2727
"//pkg/roachprod/ssh",
2828
"//pkg/roachprod/ui",
2929
"//pkg/roachprod/vm",
30+
"//pkg/roachprod/vm/aws",
3031
"//pkg/roachprod/vm/gce",
32+
"//pkg/roachprod/vm/utils/packer",
3133
"//pkg/util/envutil",
3234
"//pkg/util/flagutil",
3335
"//pkg/util/timeutil",
3436
"@com_github_cockroachdb_errors//:errors",
3537
"@com_github_cockroachdb_errors//oserror",
3638
"@com_github_fatih_color//:color",
3739
"@com_github_spf13_cobra//:cobra",
40+
"@com_github_spf13_pflag//:pflag",
3841
"@com_google_cloud_go_storage//:storage",
3942
"@org_golang_google_api//option",
4043
"@org_golang_x_crypto//ssh",

pkg/cmd/roachprod/cli/commands.go

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,9 @@ import (
2828
"github.com/cockroachdb/cockroach/pkg/roachprod/roachprodutil"
2929
"github.com/cockroachdb/cockroach/pkg/roachprod/ui"
3030
"github.com/cockroachdb/cockroach/pkg/roachprod/vm"
31+
"github.com/cockroachdb/cockroach/pkg/roachprod/vm/aws"
3132
"github.com/cockroachdb/cockroach/pkg/roachprod/vm/gce"
33+
"github.com/cockroachdb/cockroach/pkg/roachprod/vm/utils/packer"
3234
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
3335
"github.com/cockroachdb/errors"
3436
"github.com/fatih/color"
@@ -2256,3 +2258,132 @@ If a destination is not provided, the certs will be downloaded to a default %s d
22562258
}),
22572259
}
22582260
}
2261+
2262+
func (cr *commandRegistry) buildBakeImageCmd() *cobra.Command {
2263+
var providers []string
2264+
var zones []string
2265+
var project string
2266+
var architectures []string
2267+
var dryRun bool
2268+
2269+
bakeImageCmd := &cobra.Command{
2270+
Use: "bake-images",
2271+
Short: "pre-bake cloud VM images with roachprod configuration",
2272+
Long: `Pre-bake cloud VM images with packages and configuration to speed up cluster creation.
2273+
2274+
This command creates cloud images with all roachprod packages (node_exporter,
2275+
ebpf_exporter, chrony, etc.) pre-installed. When creating clusters with these
2276+
pre-baked images, instance startup time is significantly reduced since packages
2277+
don't need to be downloaded and installed.
2278+
2279+
The image name is generated based on the base Ubuntu image and a checksum of
2280+
the startup template content. If an image with the computed name already exists,
2281+
the command exits successfully without creating a new image.
2282+
2283+
If no provider is specified, images will be built for all supported providers.
2284+
If no architecture is specified, all supported architectures will be built.
2285+
2286+
Examples:
2287+
roachprod bake-image # Build for all providers and architectures
2288+
roachprod bake-image --provider=gce --zones=us-central1-a # GCE only
2289+
roachprod bake-image --provider=aws # AWS only
2290+
roachprod bake-image --provider=gce,aws --arch=amd64,arm64 # Both providers, specific architectures`,
2291+
Args: cobra.NoArgs,
2292+
Run: Wrap(func(cmd *cobra.Command, args []string) error {
2293+
// Default to all providers if none specified
2294+
if len(providers) == 0 {
2295+
providers = []string{"gce", "aws"}
2296+
}
2297+
2298+
// Default to all architectures if none specified
2299+
if len(architectures) == 0 {
2300+
architectures = []string{"amd64", "arm64", "fips"}
2301+
}
2302+
2303+
// Collect sources and provisioners from all providers
2304+
var allSources []packer.SourceConfig
2305+
var allProvisioners []packer.ProvisionerConfig
2306+
var allPlugins []packer.PluginConfig
2307+
2308+
for _, provider := range providers {
2309+
config.Logger.Printf("Preparing images for provider: %s", provider)
2310+
2311+
var sources []packer.SourceConfig
2312+
var provisioners []packer.ProvisionerConfig
2313+
var plugins []packer.PluginConfig
2314+
var err error
2315+
2316+
switch provider {
2317+
case "gce":
2318+
// Get GCE provider instance
2319+
gceProvider, ok := vm.Providers["gce"].(*gce.Provider)
2320+
if !ok {
2321+
return errors.New("GCE provider not initialized")
2322+
}
2323+
2324+
providerOpts := map[string]interface{}{
2325+
"zones": zones,
2326+
"project": project,
2327+
}
2328+
sources, provisioners, plugins, err = gceProvider.GetPackerSources(
2329+
config.Logger, architectures, providerOpts,
2330+
)
2331+
2332+
case "aws":
2333+
// Get AWS provider instance
2334+
awsProvider, ok := vm.Providers["aws"].(*aws.Provider)
2335+
if !ok {
2336+
return errors.New("AWS provider not initialized")
2337+
}
2338+
2339+
providerOpts := map[string]interface{}{}
2340+
sources, provisioners, plugins, err = awsProvider.GetPackerSources(
2341+
config.Logger, architectures, providerOpts,
2342+
)
2343+
2344+
default:
2345+
return errors.Newf("unsupported provider: %s (supported: gce, aws)", provider)
2346+
}
2347+
2348+
if err != nil {
2349+
return errors.Wrapf(err, "failed to get Packer sources for %s", provider)
2350+
}
2351+
2352+
// Accumulate sources, provisioners, and plugins
2353+
allSources = append(allSources, sources...)
2354+
allProvisioners = append(allProvisioners, provisioners...)
2355+
allPlugins = append(allPlugins, plugins...)
2356+
}
2357+
2358+
// If no sources to build, we're done
2359+
if len(allSources) == 0 {
2360+
config.Logger.Printf("All requested images already exist, nothing to build")
2361+
return nil
2362+
}
2363+
2364+
// Build all images in a single Packer run (parallel across providers!)
2365+
config.Logger.Printf("Building images across %d provider(s) in parallel...", len(providers))
2366+
if err := packer.Build(config.Logger, allSources, allProvisioners, allPlugins, dryRun); err != nil {
2367+
return errors.Wrap(err, "packer build failed")
2368+
}
2369+
2370+
config.Logger.Printf("Successfully baked images for all requested providers")
2371+
return nil
2372+
}),
2373+
}
2374+
2375+
bakeImageCmd.Flags().StringSliceVar(&providers, "provider", nil,
2376+
"cloud provider(s) to build images for (gce, aws); if not specified, builds for all")
2377+
bakeImageCmd.Flags().StringSliceVar(&zones, "zones", []string{"us-central1-a"},
2378+
"zones to build the image in (GCE only, uses first zone)")
2379+
bakeImageCmd.Flags().StringVar(&project, "project", gce.DefaultProject(),
2380+
"GCE project to create the image in (GCE only)")
2381+
bakeImageCmd.Flags().StringSliceVar(&architectures, "arch", nil,
2382+
"architectures to build for (amd64, arm64, fips); if not specified, builds all")
2383+
bakeImageCmd.Flags().BoolVar(&dryRun, "dry-run", false,
2384+
"if set, only prints the actions without executing them")
2385+
2386+
cr.addToExcludeFromBashCompletion(bakeImageCmd)
2387+
cr.addToExcludeFromClusterFlagsMulti(bakeImageCmd)
2388+
return bakeImageCmd
2389+
}

pkg/cmd/roachprod/cli/resgistry.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,5 +73,6 @@ func (cr *commandRegistry) register() {
7373
cr.buildFetchLogsCmd(),
7474
cr.buildGetLatestPProfCmd(),
7575
cr.buildFetchCertsDir(),
76+
cr.buildBakeImageCmd(),
7677
})
7778
}

pkg/cmd/roachprod/cli/util.go

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ package cli
88
import (
99
"fmt"
1010
"os"
11+
"slices"
1112
"strings"
1213
"text/tabwriter"
1314
"time"
@@ -19,6 +20,7 @@ import (
1920
"github.com/cockroachdb/errors"
2021
"github.com/cockroachdb/errors/oserror"
2122
"github.com/spf13/cobra"
23+
"github.com/spf13/pflag"
2224
)
2325

2426
func PromptYesNo(msg string, defaultYes bool) bool {
@@ -211,16 +213,40 @@ func ValidateAndConfigure(cmd *cobra.Command, args []string) {
211213
}
212214
}
213215

216+
validArchitectures := []vm.CPUArch{vm.ArchAMD64, vm.ArchARM64, vm.ArchFIPS, vm.ArchS390x}
217+
214218
// Validate architecture flag, if set.
215219
if archOpt := cmd.Flags().Lookup("arch"); archOpt != nil && archOpt.Changed {
216-
arch := vm.CPUArch(strings.ToLower(archOpt.Value.String()))
217220

218-
if arch != vm.ArchAMD64 && arch != vm.ArchARM64 && arch != vm.ArchFIPS && arch != vm.ArchS390x {
219-
printErrAndExit(fmt.Errorf("unsupported architecture %q", arch))
221+
var architecturesToValidate = []string{}
222+
switch v := archOpt.Value.(type) {
223+
case pflag.SliceValue:
224+
architecturesToValidate = v.GetSlice()
225+
default:
226+
architecturesToValidate = []string{archOpt.Value.String()}
227+
}
228+
229+
normalizedArchitectures := make([]string, 0)
230+
for _, archStr := range architecturesToValidate {
231+
232+
arch := vm.CPUArch(strings.ToLower(archStr))
233+
234+
if !slices.Contains(validArchitectures, arch) {
235+
printErrAndExit(fmt.Errorf("unsupported architecture %q", archStr))
236+
}
237+
238+
// Canonicalize architecture flag value.
239+
normalizedArchitectures = append(normalizedArchitectures, string(arch))
220240
}
221-
if string(arch) != archOpt.Value.String() {
222-
// Set the canonical value.
223-
_ = cmd.Flags().Set("arch", string(arch))
241+
242+
// Replace the value by accessing the underlying slice directly
243+
// This is a bit hacky but works with pflag
244+
if sliceValue, ok := archOpt.Value.(pflag.SliceValue); ok {
245+
// For StringSliceVar, replace the entire slice
246+
_ = sliceValue.Replace(normalizedArchitectures)
247+
} else {
248+
// For StringVar, just set the first value
249+
_ = cmd.Flags().Set("arch", normalizedArchitectures[0])
224250
}
225251
}
226252

pkg/roachprod/vm/aws/BUILD.bazel

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ go_library(
66
"aws.go",
77
"config.go",
88
"keys.go",
9+
"pre_bake_amis.go",
910
"support.go",
1011
],
1112
embedsrcs = [
@@ -19,6 +20,7 @@ go_library(
1920
"//pkg/roachprod/logger",
2021
"//pkg/roachprod/vm",
2122
"//pkg/roachprod/vm/flagstub",
23+
"//pkg/roachprod/vm/utils/packer",
2224
"//pkg/util/retry",
2325
"//pkg/util/syncutil",
2426
"//pkg/util/timeutil",

0 commit comments

Comments
 (0)