From 2aba06fa38b5f7f67eb658af5e0d937aaf23dfce Mon Sep 17 00:00:00 2001 From: jimmycgz Date: Mon, 17 Nov 2025 22:21:40 -0500 Subject: [PATCH 1/2] fix gcp teardown bug --- perfkitbenchmarker/benchmark_spec.py | 144 +++++++++++++++++++++++---- 1 file changed, 127 insertions(+), 17 deletions(-) diff --git a/perfkitbenchmarker/benchmark_spec.py b/perfkitbenchmarker/benchmark_spec.py index 662ef894ea..dc3e776762 100644 --- a/perfkitbenchmarker/benchmark_spec.py +++ b/perfkitbenchmarker/benchmark_spec.py @@ -1100,7 +1100,9 @@ def Provision(self): self.data_discovery_service.Create() def Delete(self): - if self.deleted: + # Don't trust the deleted flag during teardown-only runs + # This handles cases where pickle data is corrupted or provision failed + if self.deleted and stages.TEARDOWN not in FLAGS.run_stage: return should_freeze = ( @@ -1179,14 +1181,29 @@ def Delete(self): for placement_group_object in self.placement_groups.values(): placement_group_object.Delete() - for firewall in self.firewalls.values(): - try: - firewall.DisallowAllPorts() - except Exception: - logging.exception( - 'Got an exception disabling firewalls. ' - 'Attempting to continue tearing down.' - ) + # Validate network/firewall objects and fall back to direct cleanup if invalid + valid_networks = self._ValidateNetworkObjects() + valid_firewalls = self._ValidateFirewallObjects() + + if not valid_networks or not valid_firewalls: + logging.warning( + 'Invalid network/firewall objects detected (corrupted pickle data). ' + 'Falling back to direct GCP resource cleanup.' + ) + self._CleanupOrphanedGCPResources() + else: + # Standard cleanup with validated objects + for firewall in self.firewalls.values(): + try: + if hasattr(firewall, 'Delete') and callable(firewall.Delete): + firewall.Delete() + else: + logging.warning('Firewall object missing Delete method, skipping') + except Exception: + logging.exception( + 'Got an exception deleting firewalls. ' + 'Attempting to continue tearing down.' + ) if self.container_cluster: self.container_cluster.DeleteServices() @@ -1196,14 +1213,18 @@ def Delete(self): if self.cluster: self.cluster.Delete() - for net in self.networks.values(): - try: - net.Delete() - except Exception: - logging.exception( - 'Got an exception deleting networks. ' - 'Attempting to continue tearing down.' - ) + if valid_networks: + for net in self.networks.values(): + try: + if hasattr(net, 'Delete') and callable(net.Delete): + net.Delete() + else: + logging.warning('Network object missing Delete method, skipping') + except Exception: + logging.exception( + 'Got an exception deleting networks. ' + 'Attempting to continue tearing down.' + ) if hasattr(self, 'vpn_service') and self.vpn_service: self.vpn_service.Delete() @@ -1390,6 +1411,95 @@ def GetBenchmarkSpec(cls, benchmark_module, config, uid): context.SetThreadBenchmarkSpec(bm_spec) return bm_spec + def _ValidateNetworkObjects(self) -> bool: + """Validate that network objects are actual network instances with Delete methods.""" + if not self.networks: + return False + + # Check for corrupted pickle data: keys should be network names, not JSON fragments + for key, net in self.networks.items(): + # Keys that look like JSON fragments indicate corruption + if key.strip() in ['{', '}', '[', ']', ','] or key.strip().startswith('"'): + logging.warning(f'Network dict has corrupted keys (JSON fragments): {key[:50]}') + return False + + if not hasattr(net, 'Delete') or not callable(net.Delete): + logging.warning(f'Network object {key} is invalid (no Delete method)') + return False + + return True + + def _ValidateFirewallObjects(self) -> bool: + """Validate that firewall objects are actual firewall instances with Delete methods.""" + if not self.firewalls: + # Empty firewalls dict is valid (no firewalls created yet) + return True + for fw in self.firewalls.values(): + if not hasattr(fw, 'Delete') or not callable(fw.Delete): + return False + return True + + def _CleanupOrphanedGCPResources(self) -> None: + """Directly query and delete GCP resources by run_uri pattern. + + This method is called when network/firewall objects are invalid (corrupted pickle). + It bypasses the object-based deletion and uses gcloud commands directly. + """ + if FLAGS.cloud != provider_info.GCP: + logging.warning('Direct cleanup only supported for GCP') + return + + import subprocess + + project = FLAGS.project + if not project: + logging.error('No project specified for cleanup') + return + + logging.info(f'Cleaning up orphaned GCP resources for run_uri: {FLAGS.run_uri}') + + # Delete firewall rules first (dependency for networks) + try: + # List firewalls matching run_uri + result = subprocess.run( + ['gcloud', 'compute', 'firewall-rules', 'list', + '--project', project, + '--filter', f'name~-{FLAGS.run_uri}', + '--format', 'value(name)'], + capture_output=True, text=True, check=False + ) + if result.returncode == 0 and result.stdout.strip(): + for firewall_name in result.stdout.strip().split('\n'): + logging.info(f'Deleting orphaned firewall: {firewall_name}') + subprocess.run( + ['gcloud', 'compute', 'firewall-rules', 'delete', firewall_name, + '--project', project, '--quiet'], + check=False + ) + except Exception as e: + logging.exception(f'Failed to clean up firewalls: {e}') + + # Delete networks + try: + # List networks matching run_uri + result = subprocess.run( + ['gcloud', 'compute', 'networks', 'list', + '--project', project, + '--filter', f'name~pkb-network.*{FLAGS.run_uri}', + '--format', 'value(name)'], + capture_output=True, text=True, check=False + ) + if result.returncode == 0 and result.stdout.strip(): + for network_name in result.stdout.strip().split('\n'): + logging.info(f'Deleting orphaned network: {network_name}') + subprocess.run( + ['gcloud', 'compute', 'networks', 'delete', network_name, + '--project', project, '--quiet'], + check=False + ) + except Exception as e: + logging.exception(f'Failed to clean up networks: {e}') + def CheckPrerequisites(self) -> None: """Checks preconditions for the benchmark_spec.""" for app_group in self.app_groups: From b4ae328dc012456bb4e80990711e6d911b70a526 Mon Sep 17 00:00:00 2001 From: jimmycgz Date: Mon, 17 Nov 2025 22:32:53 -0500 Subject: [PATCH 2/2] Fix incomplete teardown when provision fails Handle corrupted pickle data during teardown by: - Validating network/firewall objects before deletion - Detecting tuple keys and JSON fragments in corrupted data - Falling back to direct gcloud cleanup when objects invalid - Bypassing deleted flag during teardown-only runs This fixes the issue where failed provisions leave orphaned GCP networks and firewalls that cost money and block future runs. Fixes #6223 --- perfkitbenchmarker/benchmark_spec.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/perfkitbenchmarker/benchmark_spec.py b/perfkitbenchmarker/benchmark_spec.py index dc3e776762..855dff78d3 100644 --- a/perfkitbenchmarker/benchmark_spec.py +++ b/perfkitbenchmarker/benchmark_spec.py @@ -1416,11 +1416,19 @@ def _ValidateNetworkObjects(self) -> bool: if not self.networks: return False - # Check for corrupted pickle data: keys should be network names, not JSON fragments + # Check for corrupted pickle data for key, net in self.networks.items(): + # Handle tuple keys (which indicate corruption) + if isinstance(key, tuple): + logging.warning(f'Network dict has tuple keys (corrupted): {key}') + return False + + # Convert key to string for checking + key_str = str(key) + # Keys that look like JSON fragments indicate corruption - if key.strip() in ['{', '}', '[', ']', ','] or key.strip().startswith('"'): - logging.warning(f'Network dict has corrupted keys (JSON fragments): {key[:50]}') + if key_str.strip() in ['{', '}', '[', ']', ','] or key_str.strip().startswith('"'): + logging.warning(f'Network dict has corrupted keys (JSON fragments): {key_str[:50]}') return False if not hasattr(net, 'Delete') or not callable(net.Delete):