From 2689e69fb771ce2e7b30ee500e0363df93339fb5 Mon Sep 17 00:00:00 2001 From: Spencer Kim Date: Tue, 3 Feb 2026 08:35:13 -0800 Subject: [PATCH] delete PiperOrigin-RevId: 864894124 --- CHANGES.next.md | 1 + perfkitbenchmarker/cloud_tpu.py | 79 +----- .../configs/benchmark_config_spec.py | 33 +-- .../linux_benchmarks/mlperf_benchmark.py | 255 +----------------- .../linux_benchmarks/mnist_benchmark.py | 131 ++------- .../provision_tpu_benchmark.py | 83 ++++++ perfkitbenchmarker/providers/gcp/gcp_tpu.py | 75 +++--- tests/cloud_tpu_test.py | 93 ++----- tests/providers/gcp/gcp_tpu_test.py | 51 +--- 9 files changed, 198 insertions(+), 603 deletions(-) create mode 100644 perfkitbenchmarker/linux_benchmarks/provisioning_benchmarks/provision_tpu_benchmark.py diff --git a/CHANGES.next.md b/CHANGES.next.md index 0d54e13eb1..fb90ec7cee 100644 --- a/CHANGES.next.md +++ b/CHANGES.next.md @@ -641,3 +641,4 @@ - Added --ycsb_load_max_error_rates and --ycsb_run_max_error_rates as a more general solution that can be used with or instead of --ycsb_max_error_rate and --ycsb_fail_on_incomplete_loading. +- Support basic TPU VM provisioning. diff --git a/perfkitbenchmarker/cloud_tpu.py b/perfkitbenchmarker/cloud_tpu.py index 9a8f06154b..9f694f97ef 100644 --- a/perfkitbenchmarker/cloud_tpu.py +++ b/perfkitbenchmarker/cloud_tpu.py @@ -13,47 +13,17 @@ # limitations under the License. """Module containing class for TPU.""" -import abc - from absl import flags from perfkitbenchmarker import resource -flags.DEFINE_string( - 'tpu_cidr_range', - None, - """CIDR Range for the TPU. The IP - range that the TPU will select an IP address from. Must be - in CIDR notation and a /29 range, for example - 192.168.0.0/29. 
Errors will occur if the CIDR range has - already been used for a currently existing TPU, the CIDR - range conflicts with any networks in the user's provided - network, or the provided network is peered with another - network that is using that CIDR range.""", -) -flags.DEFINE_string( - 'tpu_accelerator_type', 'tpu-v2', 'TPU accelerator type for the TPU.' -) -flags.DEFINE_string( - 'tpu_description', None, 'Specifies a text description of the TPU.' -) -flags.DEFINE_string( - 'tpu_network', - None, - 'Specifies the network that this TPU will be a part of.', -) +flags.DEFINE_string('tpu_name', None, 'The name of the TPU to create.') +flags.DEFINE_string('tpu_type', None, 'TPU type for the TPU.') +flags.DEFINE_string('tpu_topology', None, 'TPU topology for the TPU.') flags.DEFINE_string('tpu_tf_version', None, 'TensorFlow version for the TPU.') flags.DEFINE_string( 'tpu_zone', None, 'The zone of the tpu to create. Zone in which TPU lives.' ) -flags.DEFINE_string('tpu_name', None, 'The name of the TPU to create.') -flags.DEFINE_boolean('tpu_preemptible', False, 'Use preemptible TPU or not.') -flags.DEFINE_integer( - 'tpu_cores_per_donut', - 8, - 'The number of cores per TPU donut. 
This is 8 because each' - ' TPU has 4 chips each with 2 cores.', -) FLAGS = flags.FLAGS @@ -86,49 +56,16 @@ def __init__(self, tpu_spec): """ super().__init__() self.spec = tpu_spec - - def _Create(self): - """Creates the TPU.""" - raise NotImplementedError() - - def _Delete(self): - """Deletes the TPU.""" - raise NotImplementedError() - - @abc.abstractmethod - def GetName(self): - raise NotImplementedError() - - @abc.abstractmethod - def GetMasterGrpcAddress(self): - """Gets the master grpc address of the TPU.""" - raise NotImplementedError() - - @abc.abstractmethod - def GetNumShards(self): - """Gets the number of TPU shards.""" - raise NotImplementedError() - - @abc.abstractmethod - def GetZone(self): - """Gets the TPU zone.""" - raise NotImplementedError() - - @abc.abstractmethod - def GetAcceleratorType(self): - """Gets the TPU accelerator type.""" - raise NotImplementedError() + self.create_start_time = -1 + self.create_time = -1 def GetResourceMetadata(self): """Returns a dictionary of cluster metadata.""" metadata = { - 'cidr_range': self.spec.tpu_cidr_range, - 'accelerator_type': self.spec.tpu_accelerator_type, - 'description': self.spec.tpu_description, - 'network': self.spec.tpu_network, + 'name': self.spec.tpu_name, + 'tpu_type': self.spec.tpu_type, + 'tpu_topology': self.spec.tpu_topology, 'tf_version': self.spec.tpu_tf_version, 'zone': self.spec.tpu_zone, - 'name': self.spec.tpu_name, - 'preemptible': self.spec.tpu_preemptible, } return metadata diff --git a/perfkitbenchmarker/configs/benchmark_config_spec.py b/perfkitbenchmarker/configs/benchmark_config_spec.py index d7219eb525..1f4628b4fc 100644 --- a/perfkitbenchmarker/configs/benchmark_config_spec.py +++ b/perfkitbenchmarker/configs/benchmark_config_spec.py @@ -471,29 +471,20 @@ def _GetOptionDecoderConstructions(cls): option_decoders.EnumDecoder, {'valid_values': provider_info.VALID_CLOUDS}, ), - 'tpu_cidr_range': ( - option_decoders.StringDecoder, - {'default': None}, - ), - 
'tpu_accelerator_type': ( + 'tpu_name': (option_decoders.StringDecoder, {'default': None}), + 'tpu_type': ( option_decoders.StringDecoder, {'default': None}, ), - 'tpu_description': ( + 'tpu_topology': ( option_decoders.StringDecoder, {'default': None}, ), - 'tpu_network': (option_decoders.StringDecoder, {'default': None}), 'tpu_tf_version': ( option_decoders.StringDecoder, {'default': None}, ), 'tpu_zone': (option_decoders.StringDecoder, {'default': None}), - 'tpu_name': (option_decoders.StringDecoder, {'default': None}), - 'tpu_preemptible': ( - option_decoders.BooleanDecoder, - {'default': False}, - ), }) return result @@ -512,22 +503,16 @@ def _ApplyFlags(cls, config_values, flag_values): super()._ApplyFlags(config_values, flag_values) if flag_values['cloud'].present: config_values['cloud'] = flag_values.cloud - if flag_values['tpu_cidr_range'].present: - config_values['tpu_cidr_range'] = flag_values.tpu_cidr_range - if flag_values['tpu_accelerator_type'].present: - config_values['tpu_accelerator_type'] = flag_values.tpu_accelerator_type - if flag_values['tpu_description'].present: - config_values['tpu_description'] = flag_values.tpu_description - if flag_values['tpu_network'].present: - config_values['tpu_network'] = flag_values.tpu_network + if flag_values['tpu_name'].present: + config_values['tpu_name'] = flag_values.tpu_name + if flag_values['tpu_type'].present: + config_values['tpu_type'] = flag_values.tpu_type + if flag_values['tpu_topology'].present: + config_values['tpu_topology'] = flag_values.tpu_topology if flag_values['tpu_tf_version'].present: config_values['tpu_tf_version'] = flag_values.tpu_tf_version if flag_values['tpu_zone'].present: config_values['tpu_zone'] = flag_values.tpu_zone - if flag_values['tpu_name'].present: - config_values['tpu_name'] = flag_values.tpu_name - if flag_values['tpu_preemptible'].present: - config_values['tpu_preemptible'] = flag_values.tpu_preemptible class _EdwServiceDecoder(option_decoders.TypeVerifier): diff --git 
a/perfkitbenchmarker/linux_benchmarks/mlperf_benchmark.py b/perfkitbenchmarker/linux_benchmarks/mlperf_benchmark.py index 5b115b7644..3c0c7e0367 100644 --- a/perfkitbenchmarker/linux_benchmarks/mlperf_benchmark.py +++ b/perfkitbenchmarker/linux_benchmarks/mlperf_benchmark.py @@ -28,8 +28,6 @@ from perfkitbenchmarker.linux_packages import google_cloud_sdk from perfkitbenchmarker.linux_packages import nvidia_driver from perfkitbenchmarker.linux_packages import tensorflow -from perfkitbenchmarker.providers.gcp import gcs -from perfkitbenchmarker.providers.gcp import util FLAGS = flags.FLAGS @@ -277,189 +275,7 @@ def PrepareRunner(benchmark_spec, vm=None): """ vm = vm or benchmark_spec.vms[0] if benchmark_spec.tpus: - if vm == benchmark_spec.vms[0]: - storage_service = gcs.GoogleCloudStorageService() - benchmark_spec.storage_service = storage_service - if FLAGS.mlperf_bucket: - bucket = FLAGS.mlperf_bucket - benchmark_spec.model_dir = f'gs://{bucket}/pkb-{FLAGS.run_uri}' - else: - bucket = f'pkb-{FLAGS.run_uri}'.format(uri=FLAGS.run_uri) - benchmark_spec.model_dir = f'gs://{bucket}' - - benchmark_spec.bucket = bucket - location = benchmark_spec.tpu_groups['train'].GetZone() - storage_service.PrepareService(util.GetRegionFromZone(location)) - storage_service.MakeBucket(bucket) - storage_service.AclBucket( - benchmark_spec.gcp_service_account, gcs.WRITER, bucket - ) - - # For MLPerf 1.0, the benchmake code of different hardware are different. 
- if ( - benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-32' - or benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-128' - or benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-256' - or benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-512' - or benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-1024' - or benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-2048' - ): - run_path = '$HOME/training_results_{version}/Google/benchmarks/{model}/tpu-{tpus}'.format( - version=VERSION.value, - model=benchmark_spec.benchmark, - tpus=benchmark_spec.tpu_groups['train'].GetAcceleratorType(), - ) - else: - raise ValueError( - 'MLPerf configurations do not support the hardware in PKB. PKB may ' - 'need to be updated if this is a new TPU type.' - ) - - if MASK in benchmark_spec.benchmark: - model = 'mask_rcnn' - elif GNMT in benchmark_spec.benchmark: - model = 'nmt' - else: - model = benchmark_spec.benchmark - - code_path = '$HOME/training_results_{version}/Google/benchmarks/{model}/implementations/tpu-{tpus}-{model}'.format( - version=VERSION.value, - model=benchmark_spec.benchmark, - tpus=benchmark_spec.tpu_groups['train'].GetAcceleratorType(), - ) - - vm.RemoteCommand('pip3 install --upgrade pyyaml==3.13 ') - vm.RemoteCommand('pip3 install cloud-tpu-profiler==1.12') - if MASK in benchmark_spec.benchmark or SSD in benchmark_spec.benchmark: - # Install the coco package, to load the coco dataset for Mask-RCNN - # and SSD benchmarks. 
- # TODO(user): coco whl package for python 3.5 - vm.RemoteCommand( - 'cd /tmp && wget https://storage.cloud.google.com/' - 'mlperf_artifcats/v0.6_training/' - 'coco-1.1-cp36-cp36m-linux_x86_64.whl' # NOTYPO - ) - - setup_script = posixpath.join(run_path, 'setup.sh') - vm_util.ReplaceText(vm, '--progress-bar off', ' ', setup_script) - vm_util.ReplaceText(vm, 'pip ', 'pip3 ', setup_script) - vm.RemoteCommand( - 'chmod 755 {script} && {script}'.format(script=setup_script) - ) - - if MASK not in benchmark_spec.benchmark: - vm.RemoteCommand( - 'pip3 uninstall -y tf-estimator-nightly && ' - 'pip3 install tf-estimator-nightly==1.14.0.dev2019051801' - ) - - if RESNET in benchmark_spec.benchmark: - data_dir = benchmark_spec.imagenet_data_dir - elif TRANSFORMER in benchmark_spec.benchmark: - data_dir = benchmark_spec.wmt_data_dir - elif MASK in benchmark_spec.benchmark: - data_dir = benchmark_spec.coco_data_dir - elif GNMT in benchmark_spec.benchmark: - data_dir = benchmark_spec.gnmt_data_dir - elif SSD in benchmark_spec.benchmark: - data_dir = benchmark_spec.coco_data_dir - elif BERT in benchmark_spec.benchmark: - data_dir = benchmark_spec.bert_data_dir - else: - raise ValueError( - 'Unknown operation, cannot find {} in benchmark'.format( - benchmark_spec.benchmark - ) - ) - - run_script = posixpath.join(run_path, 'run_and_time.sh') - data_dir = data_dir.replace('/', r'\/') - checkpoint = FLAGS.mlperf_gcs_resnet_checkpoint.replace('/', r'\/') - decode_dir = FLAGS.mlperf_transformer_decode_dir.replace('/', r'\/') - tpu = benchmark_spec.tpu_groups['train'].GetName() - vm_util.ReplaceText( - vm, - '--model_dir=.*', - r'--model_dir=gs:\/\/{} \\\\'.format(bucket), - run_script, - ) - vm_util.ReplaceText( - vm, '--data_dir=.*', r'--data_dir={} \\\\'.format(data_dir), run_script - ) - vm_util.ReplaceText( - vm, - '--training_file_pattern=.*', - r'--training_file_pattern={}\/train-* \\\\'.format(data_dir), - run_script, - ) - vm_util.ReplaceText( - vm, - 
'--validation_file_pattern=.*', - r'--validation_file_pattern={}\/val-* \\\\'.format(data_dir), - run_script, - ) - vm_util.ReplaceText( - vm, - '--val_json_file=.*', - r'--val_json_file={}\/instances_val2017.json \\\\'.format(data_dir), - run_script, - ) - vm_util.ReplaceText( - vm, - '--resnet_checkpoint=.*', - r'--resnet_checkpoint={} \\\\'.format(checkpoint), - run_script, - ) - vm_util.ReplaceText( - vm, - '--decode_from_file=.*', - r'--decode_from_file={}\/wmt14-en-de.src \\\\'.format(decode_dir), - run_script, - ) - vm_util.ReplaceText( - vm, - '--decode_reference=.*', - r'--decode_reference={}\/wmt14-en-de.ref \\\\'.format(decode_dir), - run_script, - ) - vm_util.ReplaceText( - vm, - '--decode_to_file=.*', - r'--decode_to_file={}\/decode.transformer_mlperf_tpu.' - r'translate_ende_wmt32k_packed.2x2_log_1018_2 \\\\'.format(bucket), - run_script, - ) - vm_util.ReplaceText( - vm, '--tpu=.*', r'--tpu={} \\\\'.format(tpu), run_script - ) - vm_util.ReplaceText( - vm, - '--output_dir=.*', - r'--output_dir=gs:\/\/{} \\\\'.format(bucket), - run_script, - ) - vm_util.ReplaceText( - vm, - '--cloud_tpu_name=.*', - r'--cloud_tpu_name={} \\\\'.format(tpu), - run_script, - ) - vm_util.ReplaceText( - vm, - '--out_dir=.*', - r'--out_dir=gs:\/\/{} \\\\'.format(bucket), - run_script, - ) - vm_util.ReplaceText( - vm, '--tpu_name=.*', r'--tpu_name={} \\\\'.format(tpu), run_script - ) - vm.RemoteCommand('chmod 755 {}'.format(run_script)) - - if GNMT in benchmark_spec.benchmark: - metric_script = posixpath.join(code_path, model, 'metric.py') - vm_util.ReplaceText( - vm, ' sacrebleu -t', ' python3 -m sacrebleu -t', metric_script - ) + raise errors.Config.InvalidValue('MLPerf benchmark does not support TPUs.') else: benchmark_spec.model_dir = '/tmp' @@ -757,7 +573,6 @@ def _CreateMetadataDict(benchmark_spec): metadata dict """ metadata = { - 'use_tpu': bool(benchmark_spec.tpus), 'model_dir': benchmark_spec.model_dir, 'model': benchmark_spec.benchmark, 'version': VERSION.value, @@ 
-769,15 +584,6 @@ def _CreateMetadataDict(benchmark_spec): total_gpus = gpus_per_node * num_vms metadata.update(cuda_toolkit.GetMetadata(vm)) metadata['total_gpus'] = total_gpus - if benchmark_spec.tpus: - metadata.update({ - 'train_tpu_num_shards': benchmark_spec.tpu_groups[ - 'train' - ].GetNumShards(), - 'train_tpu_accelerator_type': benchmark_spec.tpu_groups[ - 'train' - ].GetAcceleratorType(), - }) return metadata @@ -891,55 +697,7 @@ def Run(benchmark_spec): _UpdateBenchmarkSpecWithFlags(benchmark_spec) vm = benchmark_spec.vms[0] if benchmark_spec.tpus: - # For MLPerf 1.0, the benchmake code of different hardware are different. - if ( - benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-32' - or benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-128' - or benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-256' - or benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-512' - or benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-1024' - or benchmark_spec.tpu_groups['train'].GetAcceleratorType() == 'v3-2048' - ): - run_path = '$HOME/training_results_{version}/Google/benchmarks/{model}/tpu-{tpus}'.format( - version=VERSION.value, - model=benchmark_spec.benchmark, - tpus=benchmark_spec.tpu_groups['train'].GetAcceleratorType(), - ) - code_path = '$HOME/training_results_{version}/Google/benchmarks/{model}/implementations/tpu-{tpus}-{model}'.format( - version=VERSION.value, - model=benchmark_spec.benchmark, - tpus=benchmark_spec.tpu_groups['train'].GetAcceleratorType(), - ) - - if MASK in benchmark_spec.benchmark: - model = 'mask_rcnn' - elif GNMT in benchmark_spec.benchmark: - model = 'nmt' - else: - model = benchmark_spec.benchmark - - mlperf_benchmark_cmd = ( - 'cd {code_path} && ' - 'export PYTHONPATH=$(pwd):$(pwd)/{model} && ' - 'cd {model} && ' - '{run_path}/run_and_time.sh'.format( - code_path=code_path, model=model, run_path=run_path - ) - ) - - if SSD in benchmark_spec.benchmark: - 
mlperf_benchmark_cmd = ( - 'export MLP_GCS_RESNET_CHECKPOINT={checkpoint} && {cmd}'.format( - checkpoint=FLAGS.mlperf_gcs_resnet_checkpoint, - cmd=mlperf_benchmark_cmd, - ) - ) - else: - raise ValueError( - 'MLPerf configurations do not support the hardware in PKB. PKB may ' - 'need to be updated if this is a new TPU type.' - ) - + raise errors.Config.InvalidValue('MLPerf benchmark does not support TPUs.') else: run_sub_paths = { RESNET: 'resnet/implementations/mxnet', @@ -1002,12 +760,7 @@ def Run(benchmark_spec): return samples -def Cleanup(benchmark_spec): +def Cleanup(_): """Cleanup MLPerf on the cluster. - - Args: - benchmark_spec: The benchmark specification. Contains all data that is - required to run the benchmark. """ - if benchmark_spec.tpus and FLAGS.mlperf_bucket is None: - benchmark_spec.storage_service.DeleteBucket(benchmark_spec.bucket) + pass diff --git a/perfkitbenchmarker/linux_benchmarks/mnist_benchmark.py b/perfkitbenchmarker/linux_benchmarks/mnist_benchmark.py index e5a0ff7110..f1c7b27063 100644 --- a/perfkitbenchmarker/linux_benchmarks/mnist_benchmark.py +++ b/perfkitbenchmarker/linux_benchmarks/mnist_benchmark.py @@ -23,8 +23,6 @@ from perfkitbenchmarker.linux_packages import cloud_tpu_models from perfkitbenchmarker.linux_packages import nvidia_driver from perfkitbenchmarker.linux_packages import tensorflow -from perfkitbenchmarker.providers.gcp import gcs -from perfkitbenchmarker.providers.gcp import util FLAGS = flags.FLAGS @@ -146,24 +144,10 @@ def Prepare(benchmark_spec): benchmark_spec.always_call_cleanup = True _UpdateBenchmarkSpecWithFlags(benchmark_spec) vm = benchmark_spec.vms[0] - if not benchmark_spec.tpus: - vm.Install('tensorflow') + vm.Install('tensorflow') vm.Install('cloud_tpu_models') vm.Install('tensorflow_models') - if benchmark_spec.tpus: - storage_service = gcs.GoogleCloudStorageService() - benchmark_spec.storage_service = storage_service - bucket = 'pkb{}'.format(FLAGS.run_uri) - benchmark_spec.bucket = bucket - 
benchmark_spec.model_dir = 'gs://{}'.format(bucket) - location = benchmark_spec.tpu_groups['train'].GetZone() - storage_service.PrepareService(util.GetRegionFromZone(location)) - storage_service.MakeBucket(bucket) - storage_service.AclBucket( - benchmark_spec.gcp_service_account, gcs.WRITER, bucket - ) - else: - benchmark_spec.model_dir = '/tmp' + benchmark_spec.model_dir = '/tmp' def CreateMetadataDict(benchmark_spec): @@ -177,7 +161,6 @@ def CreateMetadataDict(benchmark_spec): metadata dict """ metadata = { - 'use_tpu': bool(benchmark_spec.tpus), 'data_dir': benchmark_spec.data_dir, 'model_dir': benchmark_spec.model_dir, 'train_steps': benchmark_spec.train_steps, @@ -192,15 +175,6 @@ def CreateMetadataDict(benchmark_spec): 'train_batch_size': benchmark_spec.batch_size, 'eval_batch_size': benchmark_spec.batch_size, } - if benchmark_spec.tpus: - metadata.update({ - 'train_tpu_num_shards': benchmark_spec.tpu_groups[ - 'train' - ].GetNumShards(), - 'train_tpu_accelerator_type': benchmark_spec.tpu_groups[ - 'train' - ].GetAcceleratorType(), - }) return metadata @@ -327,40 +301,21 @@ def Run(benchmark_spec): _UpdateBenchmarkSpecWithFlags(benchmark_spec) vm = benchmark_spec.vms[0] - if benchmark_spec.tpus: - mnist_benchmark_script = 'mnist_tpu.py' - mnist_benchmark_cmd = ( - 'cd tpu/models && ' - 'export PYTHONPATH=$(pwd) && ' - 'cd official/mnist && ' - 'python {script} ' - '--data_dir={data_dir} ' - '--iterations={iterations} ' - '--model_dir={model_dir} ' - '--batch_size={batch_size}'.format( - script=mnist_benchmark_script, - data_dir=benchmark_spec.data_dir, - iterations=benchmark_spec.iterations, - model_dir=benchmark_spec.model_dir, - batch_size=benchmark_spec.batch_size, - ) - ) - else: - mnist_benchmark_script = 'mnist.py' - mnist_benchmark_cmd = ( - 'cd models && ' - 'export PYTHONPATH=$(pwd) && ' - 'cd official/mnist && ' - 'python {script} ' - '--data_dir={data_dir} ' - '--model_dir={model_dir} ' - '--batch_size={batch_size} '.format( - 
script=mnist_benchmark_script, - data_dir=benchmark_spec.data_dir, - model_dir=benchmark_spec.model_dir, - batch_size=benchmark_spec.batch_size, - ) - ) + mnist_benchmark_script = 'mnist.py' + mnist_benchmark_cmd = ( + 'cd models && ' + 'export PYTHONPATH=$(pwd) && ' + 'cd official/mnist && ' + 'python {script} ' + '--data_dir={data_dir} ' + '--model_dir={model_dir} ' + '--batch_size={batch_size} '.format( + script=mnist_benchmark_script, + data_dir=benchmark_spec.data_dir, + model_dir=benchmark_spec.model_dir, + batch_size=benchmark_spec.batch_size, + ) + ) if nvidia_driver.CheckNvidiaGpuExists(vm): mnist_benchmark_cmd = '{env} {cmd}'.format( @@ -370,29 +325,9 @@ def Run(benchmark_spec): metadata = CreateMetadataDict(benchmark_spec) if benchmark_spec.train_steps > 0: - if benchmark_spec.tpus: - tpu = benchmark_spec.tpu_groups['train'].GetName() - num_shards = '--num_shards={}'.format( - benchmark_spec.tpu_groups['train'].GetNumShards() - ) - else: - tpu = num_shards = '' - - if benchmark_spec.tpus: - mnist_benchmark_train_cmd = ( - '{cmd} --tpu={tpu} --use_tpu={use_tpu} --train_steps={train_steps} ' - '{num_shards} --noenable_predict'.format( - cmd=mnist_benchmark_cmd, - tpu=tpu, - use_tpu=bool(benchmark_spec.tpus), - train_steps=benchmark_spec.train_steps, - num_shards=num_shards, - ) - ) - else: - mnist_benchmark_train_cmd = '{cmd} --train_epochs={train_epochs} '.format( - cmd=mnist_benchmark_cmd, train_epochs=benchmark_spec.train_epochs - ) + mnist_benchmark_train_cmd = '{cmd} --train_epochs={train_epochs} '.format( + cmd=mnist_benchmark_cmd, train_epochs=benchmark_spec.train_epochs + ) start = time.time() stdout, stderr = vm.RobustRemoteCommand(mnist_benchmark_train_cmd) @@ -407,20 +342,9 @@ def Run(benchmark_spec): ) if benchmark_spec.eval_steps > 0: - if benchmark_spec.tpus: - mnist_benchmark_eval_cmd = ( - '{cmd} --tpu={tpu} --use_tpu={use_tpu} --eval_steps={eval_steps}' - .format( - cmd=mnist_benchmark_cmd, - use_tpu=bool(benchmark_spec.tpus), - 
tpu=benchmark_spec.tpu_groups['eval'].GetName(), - eval_steps=benchmark_spec.eval_steps, - ) - ) - else: - mnist_benchmark_eval_cmd = '{cmd} --eval_steps={eval_steps}'.format( - cmd=mnist_benchmark_cmd, eval_steps=benchmark_spec.eval_steps - ) + mnist_benchmark_eval_cmd = '{cmd} --eval_steps={eval_steps}'.format( + cmd=mnist_benchmark_cmd, eval_steps=benchmark_spec.eval_steps + ) stdout, stderr = vm.RobustRemoteCommand(mnist_benchmark_eval_cmd) samples.extend( @@ -429,12 +353,7 @@ def Run(benchmark_spec): return samples -def Cleanup(benchmark_spec): +def Cleanup(_): """Cleanup MNIST on the cluster. - - Args: - benchmark_spec: The benchmark specification. Contains all data that is - required to run the benchmark. """ - if benchmark_spec.tpus: - benchmark_spec.storage_service.DeleteBucket(benchmark_spec.bucket) + pass diff --git a/perfkitbenchmarker/linux_benchmarks/provisioning_benchmarks/provision_tpu_benchmark.py b/perfkitbenchmarker/linux_benchmarks/provisioning_benchmarks/provision_tpu_benchmark.py new file mode 100644 index 0000000000..2c517e66e7 --- /dev/null +++ b/perfkitbenchmarker/linux_benchmarks/provisioning_benchmarks/provision_tpu_benchmark.py @@ -0,0 +1,83 @@ +# Copyright 2026 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Benchmark for timing provisioning for TPU VMs. + +This measures the time it takes to create the TPU. 
+Example usage: +pkb.py --benchmarks=provision_tpu --tpu_type=v6e --tpu_topology=1x1 +""" + +import time +from typing import List + +from absl import flags +from perfkitbenchmarker import benchmark_spec +from perfkitbenchmarker import configs +from perfkitbenchmarker import sample + +FLAGS = flags.FLAGS + +BENCHMARK_NAME = 'provision_tpu' +BENCHMARK_CONFIG = """ +provision_tpu: + description: > + Time spinning up a TPU VMs. + tpu_groups: + default: + cloud: GCP + tpu_type: v6e + tpu_topology: 1x1 + tpu_tf_version: v2-alpha-tpuv6e + tpu_zone: europe-west4-a +""" + + +def GetConfig(user_config): + return configs.LoadConfig(BENCHMARK_CONFIG, user_config, BENCHMARK_NAME) + + +def Prepare(_): + pass + + +def CheckPrerequisites(_): + """Perform flag checks.""" + pass + + +def Run(bm_spec: benchmark_spec.BenchmarkSpec) -> List[sample.Sample]: + """Runs the benchmark.""" + tpu_vm = bm_spec.tpu_groups['default'] + metadata = tpu_vm.GetResourceMetadata() + tpu_vm.WaitForSshBecameReady() + time_to_ssh = time.time() - tpu_vm.create_start_time + return [ + sample.Sample( + 'Time to Create TPU', + tpu_vm.create_time, + 'seconds', + metadata, + ), + sample.Sample( + 'Time From Create to SSH', + time_to_ssh, + 'seconds', + metadata, + ), + ] + + +def Cleanup(_): + pass diff --git a/perfkitbenchmarker/providers/gcp/gcp_tpu.py b/perfkitbenchmarker/providers/gcp/gcp_tpu.py index 3a31367923..1e4c344d5e 100644 --- a/perfkitbenchmarker/providers/gcp/gcp_tpu.py +++ b/perfkitbenchmarker/providers/gcp/gcp_tpu.py @@ -1,4 +1,4 @@ -# Copyright 2017 PerfKitBenchmarker Authors. All rights reserved. +# Copyright 2026 PerfKitBenchmarker Authors. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -18,10 +18,12 @@ import json import logging +import time from absl import flags from perfkitbenchmarker import cloud_tpu from perfkitbenchmarker import errors from perfkitbenchmarker import provider_info +from perfkitbenchmarker import vm_util from perfkitbenchmarker.providers.gcp import util FLAGS = flags.FLAGS @@ -47,29 +49,27 @@ class GcpTpu(cloud_tpu.BaseTpu): def __init__(self, tpu_spec): super().__init__(tpu_spec) - self.spec = tpu_spec self.project = FLAGS.project or util.GetDefaultProject() def _Create(self): """Create Cloud TPU.""" - cmd = util.GcloudCommand( - self, 'compute', 'tpus', 'create', self.spec.tpu_name - ) - cmd.flags['range'] = self.spec.tpu_cidr_range - if self.spec.tpu_accelerator_type: - cmd.flags['accelerator-type'] = self.spec.tpu_accelerator_type - if self.spec.tpu_description: - cmd.flags['description'] = self.spec.tpu_description - if self.spec.tpu_network: - cmd.flags['network'] = self.spec.tpu_network + is_v6e = self.spec.tpu_type and self.spec.tpu_type.startswith('v6e') + components = ['compute', 'tpus', 'tpu-vm', 'create', self.spec.tpu_name] + if is_v6e: + components = ['alpha'] + components + cmd = util.GcloudCommand(self, *components) + if self.spec.tpu_type: + cmd.flags['type'] = self.spec.tpu_type + if self.spec.tpu_topology: + cmd.flags['topology'] = self.spec.tpu_topology if self.spec.tpu_tf_version: cmd.flags['version'] = self.spec.tpu_tf_version if self.spec.tpu_zone: cmd.flags['zone'] = self.spec.tpu_zone - if self.spec.tpu_preemptible: - cmd.flags['preemptible'] = self.spec.tpu_preemptible cmd.flags['project'] = self.project + self.create_start_time = time.time() _, stderr, retcode = cmd.Issue(raise_on_failure=False) + self.create_time = time.time() - self.create_start_time if _INSUFFICIENT_CAPACITY in stderr: logging.error(util.STOCKOUT_MESSAGE) @@ -82,9 +82,10 @@ def _Create(self): def _Delete(self): """Deletes the cloud TPU.""" - cmd = util.GcloudCommand( - self, 'compute', 'tpus', 'delete', self.spec.tpu_name - ) 
+ components = ['compute', 'tpus', 'tpu-vm', 'delete', self.spec.tpu_name] + if self.spec.tpu_type and self.spec.tpu_type.startswith('v6e'): + components = ['alpha'] + components + cmd = util.GcloudCommand(self, *components) if self.spec.tpu_zone: cmd.flags['zone'] = self.spec.tpu_zone cmd.flags['project'] = self.project @@ -96,12 +97,14 @@ def _Delete(self): def _GetTpuDescription(self): """Gets the cloud TPU description.""" - cmd = util.GcloudCommand( - self, 'compute', 'tpus', 'describe', self.spec.tpu_name - ) + components = ['compute', 'tpus', 'tpu-vm', 'describe', self.spec.tpu_name] + if self.spec.tpu_type and self.spec.tpu_type.startswith('v6e'): + components = ['alpha'] + components + cmd = util.GcloudCommand(self, *components) if self.spec.tpu_zone: cmd.flags['zone'] = self.spec.tpu_zone cmd.flags['project'] = self.project + cmd.flags['format'] = 'json' stdout, _, retcode = cmd.Issue(raise_on_failure=False) if retcode != 0: logging.info('Could not found GCP cloud TPU %s.', self.spec.tpu_name) @@ -112,25 +115,19 @@ def _Exists(self): _, retcode = self._GetTpuDescription() return retcode == 0 - def GetName(self): - """Gets the name of the cloud TPU.""" - return self.spec.tpu_name + def SshCommand(self, command): + cmd = [FLAGS.gcloud_path] + if self.spec.tpu_type and self.spec.tpu_type.startswith('v6e'): + cmd.append('alpha') + cmd.extend(['compute', 'tpus', 'tpu-vm', 'ssh', self.spec.tpu_name]) + cmd.extend(['--zone', self.spec.tpu_zone]) + cmd.extend(['--project', self.project]) + cmd.extend(['--', command]) + return vm_util.IssueCommand(cmd, timeout=60) - def GetMasterGrpcAddress(self): - """Gets the grpc address of the 0th NetworkEndpoint.""" - master_network_endpoint = self._GetTpuDescription()[0]['networkEndpoints'][ - 0 - ] - - return 'grpc://{ip_address}:{port}'.format( - ip_address=master_network_endpoint['ipAddress'], - port=master_network_endpoint['port'], - ) - - def GetNumShards(self): - """Gets the number of TPU shards.""" - num_tpus = 
len(self._GetTpuDescription()[0]['networkEndpoints']) - return num_tpus * FLAGS.tpu_cores_per_donut + @vm_util.Retry(poll_interval=1, timeout=600, log_errors=False) + def WaitForSshBecameReady(self): + self.SshCommand('hostname') def GetZone(self): """Gets the TPU zone.""" @@ -138,7 +135,7 @@ def GetZone(self): def GetAcceleratorType(self): """Gets the TPU accelerator type.""" - return self.spec.tpu_accelerator_type + return self.spec.tpu_type def GetResourceMetadata(self): """Returns the metadata associated with the resource. diff --git a/tests/cloud_tpu_test.py b/tests/cloud_tpu_test.py index 70d1c0e4ee..7396f25f36 100644 --- a/tests/cloud_tpu_test.py +++ b/tests/cloud_tpu_test.py @@ -42,21 +42,6 @@ def _Create(self): def _Delete(self): pass - def GetName(self): - pass - - def GetMasterGrpcAddress(self): - pass - - def GetNumShards(self): - pass - - def GetZone(self): - pass - - def GetAcceleratorType(self): - pass - class TpuSpecTestCase(pkb_common_test_case.PkbCommonTestCase): @@ -93,57 +78,31 @@ def testCustomTpuName(self): ) self.assertEqual(result.tpu_name, 'pkb-tpu') - def testDefaultTpuCidrRange(self): - result = benchmark_config_spec._TpuGroupSpec( - _COMPONENT, _GROUP_NAME, flag_values=FLAGS, **self.minimal_spec - ) - self.assertEqual(result.tpu_cidr_range, None) - - def testCustomTpuCidrRange(self): - spec = MergeDicts(self.minimal_spec, {'tpu_cidr_range': '192.168.0.0/29'}) - result = benchmark_config_spec._TpuGroupSpec( - _COMPONENT, _GROUP_NAME, flag_values=FLAGS, **spec - ) - self.assertEqual(result.tpu_cidr_range, '192.168.0.0/29') - - def testDefaultTpuAcceleratorType(self): - result = benchmark_config_spec._TpuGroupSpec( - _COMPONENT, _GROUP_NAME, flag_values=FLAGS, **self.minimal_spec - ) - self.assertEqual(result.tpu_accelerator_type, None) - - def testCustomTpuAcceleratorType(self): - spec = MergeDicts(self.minimal_spec, {'tpu_accelerator_type': 'tpu-v2'}) - result = benchmark_config_spec._TpuGroupSpec( - _COMPONENT, _GROUP_NAME, 
flag_values=FLAGS, **spec - ) - self.assertEqual(result.tpu_accelerator_type, 'tpu-v2') - - def testDefaultTpuDescription(self): + def testDefaultTpuType(self): result = benchmark_config_spec._TpuGroupSpec( _COMPONENT, _GROUP_NAME, flag_values=FLAGS, **self.minimal_spec ) - self.assertEqual(result.tpu_description, None) + self.assertIsNone(result.tpu_type) - def testCustomTpuDescription(self): - spec = MergeDicts(self.minimal_spec, {'tpu_description': 'My TF Node'}) + def testCustomTpuType(self): + spec = MergeDicts(self.minimal_spec, {'tpu_type': 'v6e'}) result = benchmark_config_spec._TpuGroupSpec( _COMPONENT, _GROUP_NAME, flag_values=FLAGS, **spec ) - self.assertEqual(result.tpu_description, 'My TF Node') + self.assertEqual(result.tpu_type, 'v6e') - def testDefaultTpuNetwork(self): + def testDefaultTpuTopology(self): result = benchmark_config_spec._TpuGroupSpec( _COMPONENT, _GROUP_NAME, flag_values=FLAGS, **self.minimal_spec ) - self.assertEqual(result.tpu_network, None) + self.assertIsNone(result.tpu_topology) - def testCustomTpuNetwork(self): - spec = MergeDicts(self.minimal_spec, {'tpu_network': 'default'}) + def testCustomTpuTopology(self): + spec = MergeDicts(self.minimal_spec, {'tpu_topology': '1x1'}) result = benchmark_config_spec._TpuGroupSpec( _COMPONENT, _GROUP_NAME, flag_values=FLAGS, **spec ) - self.assertEqual(result.tpu_network, 'default') + self.assertEqual(result.tpu_topology, '1x1') def testDefaultTpuZone(self): result = benchmark_config_spec._TpuGroupSpec( @@ -181,10 +140,8 @@ def setUp(self): self.full_spec = { 'cloud': 'GCP', 'tpu_name': 'pkb-tpu-123', - 'tpu_cidr_range': '192.168.0.0/29', - 'tpu_accelerator_type': 'tpu-v2', - 'tpu_description': 'My TF Node', - 'tpu_network': 'default', + 'tpu_type': 'v5', + 'tpu_topology': '2x2', 'tpu_tf_version': 'nightly', 'tpu_zone': 'us-central1-a', } @@ -205,33 +162,19 @@ def testTpuNameFlag(self): ) self.assertEqual(result.tpu_name, 'pkb-tpu') - def testTpuCidrRangeFlag(self): - 
FLAGS['tpu_cidr_range'].parse('10.240.0.0/29') - result = benchmark_config_spec._TpuGroupSpec( - _COMPONENT, _GROUP_NAME, flag_values=FLAGS, **self.full_spec - ) - self.assertEqual(result.tpu_cidr_range, '10.240.0.0/29') - - def testTpuAcceleratorTypeFlag(self): - FLAGS['tpu_accelerator_type'].parse('tpu-v1') - result = benchmark_config_spec._TpuGroupSpec( - _COMPONENT, _GROUP_NAME, flag_values=FLAGS, **self.full_spec - ) - self.assertEqual(result.tpu_accelerator_type, 'tpu-v1') - - def testTpuDescriptionFlag(self): - FLAGS['tpu_description'].parse('MyTfNode') + def testTpuTypeFlag(self): + FLAGS['tpu_type'].parse('v6e') result = benchmark_config_spec._TpuGroupSpec( _COMPONENT, _GROUP_NAME, flag_values=FLAGS, **self.full_spec ) - self.assertEqual(result.tpu_description, 'MyTfNode') + self.assertEqual(result.tpu_type, 'v6e') - def testTpuNetworkFlag(self): - FLAGS['tpu_network'].parse('my-tf-network') + def testTpuTopologyFlag(self): + FLAGS['tpu_topology'].parse('1x1') result = benchmark_config_spec._TpuGroupSpec( _COMPONENT, _GROUP_NAME, flag_values=FLAGS, **self.full_spec ) - self.assertEqual(result.tpu_network, 'my-tf-network') + self.assertEqual(result.tpu_topology, '1x1') def testTpuTfVersion(self): FLAGS['tpu_tf_version'].parse('1.2') diff --git a/tests/providers/gcp/gcp_tpu_test.py b/tests/providers/gcp/gcp_tpu_test.py index 41ba58122d..a5fee588bc 100644 --- a/tests/providers/gcp/gcp_tpu_test.py +++ b/tests/providers/gcp/gcp_tpu_test.py @@ -37,20 +37,16 @@ class GcpTpuTestCase(pkb_common_test_case.PkbCommonTestCase): def CreateTpuSpecDict(self): return { 'tpu_name': 'pkb-tpu-123', - 'tpu_cidr_range': '192.168.0.0/29', - 'tpu_accelerator_type': 'tpu-v2', - 'tpu_description': 'MyTFNode', - 'tpu_network': 'default', + 'tpu_type': 'v4-8', + 'tpu_topology': '2x2', 'tpu_tf_version': 'nightly', 'tpu_zone': 'us-central1-a', - 'tpu_preemptible': True, } def setUp(self): super().setUp() FLAGS.run_uri = '123' FLAGS.project = '' - FLAGS.tpu_cores_per_donut = 8 
FLAGS.gcloud_path = 'gcloud' mock_tpu_spec_attrs = self.CreateTpuSpecDict() @@ -77,17 +73,16 @@ def testCreate(self): self.assertEqual(issue_command.call_count, 1) command_string = ' '.join(issue_command.call_args[0][0]) self.assertTrue( - command_string.startswith('gcloud compute tpus create pkb-tpu-123'), + command_string.startswith( + 'gcloud compute tpus tpu-vm create pkb-tpu-123' + ), command_string, ) self.assertIn('--project fakeproject', command_string) - self.assertIn('--range 192.168.0.0/29', command_string) - self.assertIn('--accelerator-type tpu-v2', command_string) - self.assertIn('--description MyTFNode', command_string) - self.assertIn('--network default', command_string) + self.assertIn('--type v4-8', command_string) + self.assertIn('--topology 2x2', command_string) self.assertIn('--version nightly', command_string) self.assertIn('--zone us-central1-a', command_string) - self.assertIn('--preemptible', command_string) def testStockout(self): stderr = """Create request issued for: [pkb-tpu-train-9baf32202] @@ -110,7 +105,9 @@ def testDelete(self): self.assertEqual(issue_command.call_count, 1) command_string = ' '.join(issue_command.call_args[0][0]) self.assertTrue( - command_string.startswith('gcloud compute tpus delete pkb-tpu-123') + command_string.startswith( + 'gcloud compute tpus tpu-vm delete pkb-tpu-123' + ) ) self.assertIn('--project fakeproject', command_string) self.assertIn('--zone us-central1-a', command_string) @@ -122,7 +119,9 @@ def testExists(self): self.assertEqual(issue_command.call_count, 1) command_string = ' '.join(issue_command.call_args[0][0]) self.assertTrue( - command_string.startswith('gcloud compute tpus describe pkb-tpu-123') + command_string.startswith( + 'gcloud compute tpus tpu-vm describe pkb-tpu-123' + ) ) self.assertIn('--project fakeproject', command_string) self.assertIn('--zone us-central1-a', command_string) @@ -130,31 +129,9 @@ def testExists(self): def testGetName(self): with self._PatchCriticalObjects(): tpu = 
gcp_tpu.GcpTpu(self.mock_tpu_spec) - name = tpu.GetName() + name = tpu.spec.tpu_name self.assertEqual(name, 'pkb-tpu-123') - def testGetNumShards(self): - with self._PatchCriticalObjects( - stdout=( - '{"networkEndpoints": [{"ipAddress": "10.199.12.2", "port": 8470}]}' - ) - ): - tpu = gcp_tpu.GcpTpu(self.mock_tpu_spec) - num_shards = tpu.GetNumShards() - self.assertEqual(num_shards, 8) - - def testGetMasterGrpcAddress(self): - with self._PatchCriticalObjects(stdout="""{ - "networkEndpoints": [{ - "ipAddress": "10.199.12.2", - "port": 8470 - }] -} - """): - tpu = gcp_tpu.GcpTpu(self.mock_tpu_spec) - ip_address = tpu.GetMasterGrpcAddress() - self.assertEqual(ip_address, 'grpc://10.199.12.2:8470') - if __name__ == '__main__': unittest.main()