@@ -502,51 +502,64 @@ impl BundleCollection {
502502 self : & Arc < Self > ,
503503 dir : & Utf8TempDir ,
504504 ) -> anyhow:: Result < SupportBundleCollectionReport > {
505- let mut collection = Box :: pin ( self . collect_bundle_as_file ( & dir) ) ;
506-
507- // We periodically check the state of the support bundle - if a user
508- // explicitly cancels it, we should stop the collection process and
509- // return.
510- let work_duration = tokio:: time:: Duration :: from_secs ( 5 ) ;
511- let mut yield_interval = tokio:: time:: interval_at (
512- tokio:: time:: Instant :: now ( ) + work_duration,
513- work_duration,
514- ) ;
515-
516- loop {
517- tokio:: select! {
518- // Timer fired mid-collection - let's check if we should stop.
519- _ = yield_interval. tick( ) => {
520- trace!(
521- & self . log,
522- "Checking if Bundle Collection cancelled" ;
523- "bundle" => %self . bundle. id
524- ) ;
505+ // TL;DR: This `tokio::select` is allowed to poll multiple futures, but
506+ // should not do any async work within the body of any chosen branch. A
507+ // previous iteration of this code polled the "collection" as "&mut
508+ // collection", and checked the status of the support bundle within a
509+ // branch of the "select" polling "yield_interval.tick()".
510+ //
511+ // We organize this work to "check for cancellation" as a whole future
512+ // for a critical, but subtle reason: After the tick timer yields,
513+ // we may then try to `await` a database function.
514+ //
515+ // This, at a surface-level glance seems innocent enough. However, there
516+ // is something potentially insidious here: if calling a datastore
517+ // function - such as "support_bundle_get" - awaits acquiring access
518+ // to a connection from the connection pool, while creating the
519+ // collection ALSO potentially awaits acquiring access to the
520+ // connection pool, it is possible for:
521+ //
522+ // 1. The `&mut collection` arm to have created a future, currently
523+ // yielded, which wants access to this underlying resource.
524+ // 2. The current operation executing in `support_bundle_get` to
525+ // be awaiting access to this same underlying resource.
526+ //
527+ // In this specific case, the connection pool would be attempting to
528+ // yield to the `&mut collection` arm, which cannot run, if we were
529+ // awaiting in the body of a different async select arm. This would
530+ // result in a deadlock.
531+ //
532+ // In the future, we may attempt to make access to the connection pool
533+ // safer from concurrent asynchronous access - it is unsettling that
534+ // multiple concurrent `.claim()` functions can cause this behavior -
535+ // but in the meantime, we perform this cancellation check in a single
536+ // future that always is polled concurrently with the collection work.
537+ // Because of this separation, each future is polled until one
538+ // completes, at which point we deterministically exit.
539+ //
540+ // For more details, see:
541+ // https://github.com/oxidecomputer/omicron/issues/9259
525542
526- let bundle = self . datastore. support_bundle_get(
527- & self . opctx,
528- self . bundle. id. into( )
529- ) . await ?;
530- if !matches!( bundle. state, SupportBundleState :: Collecting ) {
531- warn!(
532- & self . log,
533- "Support Bundle cancelled - stopping collection" ;
534- "bundle" => %self . bundle. id,
535- "state" => ?self . bundle. state
536- ) ;
537- anyhow:: bail!( "Support Bundle Cancelled" ) ;
538- }
539- } ,
540- // Otherwise, keep making progress on the collection itself.
541- report = & mut collection => {
542- info!(
543- & self . log,
544- "Bundle Collection completed" ;
545- "bundle" => %self . bundle. id
546- ) ;
547- return report;
548- } ,
549- }
543+ tokio:: select! {
544+ // Returns if the bundle should no longer be collected.
545+ why = self . check_for_cancellation( ) => {
546+ warn!(
547+ & self . log,
548+ "Support Bundle cancelled - stopping collection" ;
549+ "bundle" => %self . bundle. id,
550+ "state" => ?self . bundle. state
551+ ) ;
552+ return Err ( why) ;
553+ } ,
554+ // Otherwise, keep making progress on the collection itself.
555+ report = self . collect_bundle_as_file( & dir) => {
556+ info!(
557+ & self . log,
558+ "Bundle Collection completed" ;
559+ "bundle" => %self . bundle. id
560+ ) ;
561+ return report;
562+ } ,
550563 }
551564 }
552565
@@ -656,6 +669,65 @@ impl BundleCollection {
656669 Ok ( ( ) )
657670 }
658671
672+ // Indefinitely perform periodic checks about whether or not we should
673+ // cancel the bundle.
674+ //
675+ // Returns an error if:
676+ // - The bundle state is no longer SupportBundleState::Collecting
677+ // (which happens if the bundle has been explicitly cancelled, or
678+ // if the backing storage has been expunged).
679+ // - The bundle has been deleted
680+ //
681+ // Otherwise, keeps checking indefinitely while polled.
682+ async fn check_for_cancellation ( & self ) -> anyhow:: Error {
683+ let work_duration = tokio:: time:: Duration :: from_secs ( 5 ) ;
684+ let mut yield_interval = tokio:: time:: interval_at (
685+ tokio:: time:: Instant :: now ( ) + work_duration,
686+ work_duration,
687+ ) ;
688+
689+ loop {
690+ // Timer fired mid-collection - check if we should stop.
691+ yield_interval. tick ( ) . await ;
692+ trace ! (
693+ self . log,
694+ "Checking if Bundle Collection cancelled" ;
695+ "bundle" => %self . bundle. id
696+ ) ;
697+
698+ match self
699+ . datastore
700+ . support_bundle_get ( & self . opctx , self . bundle . id . into ( ) )
701+ . await
702+ {
703+ Ok ( SupportBundle {
704+ state : SupportBundleState :: Collecting ,
705+ ..
706+ } ) => {
707+ // Bundle still collecting; continue...
708+ continue ;
709+ }
710+ Ok ( _) => {
711+ // Not collecting, for any reason: Time to exit
712+ return anyhow:: anyhow!( "Support Bundle Cancelled" ) ;
713+ }
714+ Err ( Error :: ObjectNotFound { .. } | Error :: NotFound { .. } ) => {
715+ return anyhow:: anyhow!( "Support Bundle Deleted" ) ;
716+ }
717+ Err ( err) => {
718+ warn ! (
719+ self . log,
720+ "Database error checking bundle cancellation" ;
721+ InlineErrorChain :: new( & err)
722+ ) ;
723+
724+ // If we cannot contact the database, retry later
725+ continue ;
726+ }
727+ }
728+ }
729+ }
730+
659731 // Perform the work of collecting the support bundle into a temporary directory
660732 //
661733 // - "dir" is a directory where data can be stored.
0 commit comments