Skip to content

Commit d6edad4

Browse files
committed
roachprod: pre-bake custom roachprod cloud images
Prior to this patch, roachprod clusters were created from bare Ubuntu images. This was inadequate for multiple reasons, including: - dependency on the availability of third parties (GCS, APT repositories) - spinning up two clusters at different moments in time could lead to different resulting systems (package versions, ...) and create reproducibility issues - the growing number of installed dependencies increases the boot time. To address this, this patch creates a new `roachprod bake-images` command that relies on HashiCorp Packer to pre-bake ready-to-use cloud images for AWS and GCP. This creates a system dependency on Packer and requires the machine that runs the command to have Packer installed and to be authenticated on AWS and GCP with authorization to create instances and publish new images. If an image already exists, it won't get built again, making re-running `roachprod bake-images` safe. The pre-baking process creates images for `amd64`, `arm64` and `fips`, and pushes them to the roachprod-compatible regions (only for AWS, since images are globally available in GCP). The images are tagged with a hashed checksum of the startup script, which defines their unique version. At runtime, the providers checksum the startup script to figure out which pre-baked image should be used, and check for its availability in the cloud provider for that specific region/zone: - if the image exists, it is used to create the instance, and only a subset (runtime) of the startup scripts is executed on the instances, decreasing the startup time to a minimum (5s or so for disk setup) - if the image does not exist, the system falls back to using the base image and the whole startup script (pre-baking + runtime) is executed on the instances.
Notes: - this patch only contains the implementation for AWS and GCP; Azure and IBM should also be implemented - a CI mechanism should be built to automatically build all images when there is a change in the startup scripts (either GitHub upon merge to `master` or TeamCity nightly runs) - there is currently no built-in way to deprecate/cleanup previous images since they might still be used on older branches; a cleanup routine should be considered if/when the number of images gets out of hand Epic: none Informs: #150144 Release note: None
1 parent 1e00b30 commit d6edad4

File tree

25 files changed

+1797
-398
lines changed

25 files changed

+1797
-398
lines changed

pkg/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1739,6 +1739,7 @@ GO_TARGETS = [
17391739
"//pkg/roachprod/vm/ibm:ibm_test",
17401740
"//pkg/roachprod/vm/local:local",
17411741
"//pkg/roachprod/vm/local:local_test",
1742+
"//pkg/roachprod/vm/utils/packer:packer",
17421743
"//pkg/roachprod/vm:vm",
17431744
"//pkg/roachprod/vm:vm_test",
17441745
"//pkg/roachprod:roachprod",

pkg/cmd/roachprod/cli/BUILD.bazel

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,9 @@ go_library(
2727
"//pkg/roachprod/ssh",
2828
"//pkg/roachprod/ui",
2929
"//pkg/roachprod/vm",
30+
"//pkg/roachprod/vm/aws",
3031
"//pkg/roachprod/vm/gce",
32+
"//pkg/roachprod/vm/utils/packer",
3133
"//pkg/util/envutil",
3234
"//pkg/util/flagutil",
3335
"//pkg/util/timeutil",

pkg/cmd/roachprod/cli/commands.go

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,9 @@ import (
2828
"github.com/cockroachdb/cockroach/pkg/roachprod/roachprodutil"
2929
"github.com/cockroachdb/cockroach/pkg/roachprod/ui"
3030
"github.com/cockroachdb/cockroach/pkg/roachprod/vm"
31+
"github.com/cockroachdb/cockroach/pkg/roachprod/vm/aws"
3132
"github.com/cockroachdb/cockroach/pkg/roachprod/vm/gce"
33+
"github.com/cockroachdb/cockroach/pkg/roachprod/vm/utils/packer"
3234
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
3335
"github.com/cockroachdb/errors"
3436
"github.com/fatih/color"
@@ -2256,3 +2258,127 @@ If a destination is not provided, the certs will be downloaded to a default %s d
22562258
}),
22572259
}
22582260
}
2261+
2262+
func (cr *commandRegistry) buildBakeImageCmd() *cobra.Command {
2263+
var providers []string
2264+
var zones []string
2265+
var project string
2266+
var architectures []string
2267+
2268+
bakeImageCmd := &cobra.Command{
2269+
Use: "bake-images",
2270+
Short: "pre-bake cloud VM images with roachprod configuration",
2271+
Long: `Pre-bake cloud VM images with packages and configuration to speed up cluster creation.
2272+
2273+
This command creates cloud images with all roachprod packages (node_exporter,
2274+
ebpf_exporter, chrony, etc.) pre-installed. When creating clusters with these
2275+
pre-baked images, instance startup time is significantly reduced since packages
2276+
don't need to be downloaded and installed.
2277+
2278+
The image name is generated based on the base Ubuntu image and a checksum of
2279+
the startup template content. If an image with the computed name already exists,
2280+
the command exits successfully without creating a new image.
2281+
2282+
If no provider is specified, images will be built for all supported providers.
2283+
If no architecture is specified, all supported architectures will be built.
2284+
2285+
Examples:
2286+
roachprod bake-image # Build for all providers and architectures
2287+
roachprod bake-image --provider=gce --zones=us-central1-a # GCE only
2288+
roachprod bake-image --provider=aws # AWS only
2289+
roachprod bake-image --provider=gce,aws --arch=amd64,arm64 # Both providers, specific architectures`,
2290+
Args: cobra.NoArgs,
2291+
Run: Wrap(func(cmd *cobra.Command, args []string) error {
2292+
// Default to all providers if none specified
2293+
if len(providers) == 0 {
2294+
providers = []string{"gce", "aws"}
2295+
}
2296+
2297+
// Default to all architectures if none specified
2298+
if len(architectures) == 0 {
2299+
architectures = []string{"amd64", "arm64", "fips"}
2300+
}
2301+
2302+
// Collect sources and provisioners from all providers
2303+
var allSources []packer.SourceConfig
2304+
var allProvisioners []packer.ProvisionerConfig
2305+
var allPlugins []packer.PluginConfig
2306+
2307+
for _, provider := range providers {
2308+
config.Logger.Printf("Preparing images for provider: %s", provider)
2309+
2310+
var sources []packer.SourceConfig
2311+
var provisioners []packer.ProvisionerConfig
2312+
var plugins []packer.PluginConfig
2313+
var err error
2314+
2315+
switch provider {
2316+
case "gce":
2317+
// Get GCE provider instance
2318+
gceProvider, ok := vm.Providers["gce"].(*gce.Provider)
2319+
if !ok {
2320+
return errors.New("GCE provider not initialized")
2321+
}
2322+
2323+
providerOpts := map[string]interface{}{
2324+
"zones": zones,
2325+
"project": project,
2326+
}
2327+
sources, provisioners, plugins, err = gceProvider.GetPackerSources(
2328+
config.Logger, architectures, providerOpts)
2329+
2330+
case "aws":
2331+
// Get AWS provider instance
2332+
awsProvider, ok := vm.Providers["aws"].(*aws.Provider)
2333+
if !ok {
2334+
return errors.New("AWS provider not initialized")
2335+
}
2336+
2337+
providerOpts := map[string]interface{}{}
2338+
sources, provisioners, plugins, err = awsProvider.GetPackerSources(
2339+
config.Logger, architectures, providerOpts)
2340+
2341+
default:
2342+
return errors.Newf("unsupported provider: %s (supported: gce, aws)", provider)
2343+
}
2344+
2345+
if err != nil {
2346+
return errors.Wrapf(err, "failed to get Packer sources for %s", provider)
2347+
}
2348+
2349+
// Accumulate sources, provisioners, and plugins
2350+
allSources = append(allSources, sources...)
2351+
allProvisioners = append(allProvisioners, provisioners...)
2352+
allPlugins = append(allPlugins, plugins...)
2353+
}
2354+
2355+
// If no sources to build, we're done
2356+
if len(allSources) == 0 {
2357+
config.Logger.Printf("All requested images already exist, nothing to build")
2358+
return nil
2359+
}
2360+
2361+
// Build all images in a single Packer run (parallel across providers!)
2362+
config.Logger.Printf("Building images across %d provider(s) in parallel...", len(providers))
2363+
if err := packer.Build(config.Logger, allSources, allProvisioners, allPlugins); err != nil {
2364+
return errors.Wrap(err, "packer build failed")
2365+
}
2366+
2367+
config.Logger.Printf("Successfully baked images for all requested providers")
2368+
return nil
2369+
}),
2370+
}
2371+
2372+
bakeImageCmd.Flags().StringSliceVar(&providers, "provider", nil,
2373+
"cloud provider(s) to build images for (gce, aws); if not specified, builds for all")
2374+
bakeImageCmd.Flags().StringSliceVar(&zones, "zones", []string{"us-central1-a"},
2375+
"zones to build the image in (GCE only, uses first zone)")
2376+
bakeImageCmd.Flags().StringVar(&project, "project", gce.DefaultProject(),
2377+
"GCE project to create the image in (GCE only)")
2378+
bakeImageCmd.Flags().StringSliceVar(&architectures, "arch", nil,
2379+
"architectures to build for (amd64, arm64, fips); if not specified, builds all")
2380+
2381+
cr.addToExcludeFromBashCompletion(bakeImageCmd)
2382+
cr.addToExcludeFromClusterFlagsMulti(bakeImageCmd)
2383+
return bakeImageCmd
2384+
}

pkg/cmd/roachprod/cli/resgistry.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,5 +73,6 @@ func (cr *commandRegistry) register() {
7373
cr.buildFetchLogsCmd(),
7474
cr.buildGetLatestPProfCmd(),
7575
cr.buildFetchCertsDir(),
76+
cr.buildBakeImageCmd(),
7677
})
7778
}

pkg/roachprod/vm/aws/BUILD.bazel

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ go_library(
66
"aws.go",
77
"config.go",
88
"keys.go",
9+
"pre_bake_amis.go",
910
"support.go",
1011
],
1112
embedsrcs = [
@@ -19,6 +20,7 @@ go_library(
1920
"//pkg/roachprod/logger",
2021
"//pkg/roachprod/vm",
2122
"//pkg/roachprod/vm/flagstub",
23+
"//pkg/roachprod/vm/utils/packer",
2224
"//pkg/util/retry",
2325
"//pkg/util/syncutil",
2426
"//pkg/util/timeutil",

0 commit comments

Comments
 (0)