Skip to content

Commit a65cda6

Browse files
authored
[support bundle] Monitor for cancellation without accidental task-blocking (#9268)
Fixes #9259
1 parent 58f95de commit a65cda6

File tree

1 file changed

+116
-44
lines changed

1 file changed

+116
-44
lines changed

nexus/src/app/background/tasks/support_bundle_collector.rs

Lines changed: 116 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -502,51 +502,64 @@ impl BundleCollection {
502502
self: &Arc<Self>,
503503
dir: &Utf8TempDir,
504504
) -> anyhow::Result<SupportBundleCollectionReport> {
505-
let mut collection = Box::pin(self.collect_bundle_as_file(&dir));
506-
507-
// We periodically check the state of the support bundle - if a user
508-
// explicitly cancels it, we should stop the collection process and
509-
// return.
510-
let work_duration = tokio::time::Duration::from_secs(5);
511-
let mut yield_interval = tokio::time::interval_at(
512-
tokio::time::Instant::now() + work_duration,
513-
work_duration,
514-
);
515-
516-
loop {
517-
tokio::select! {
518-
// Timer fired mid-collection - let's check if we should stop.
519-
_ = yield_interval.tick() => {
520-
trace!(
521-
&self.log,
522-
"Checking if Bundle Collection cancelled";
523-
"bundle" => %self.bundle.id
524-
);
505+
// TL;DR: This `tokio::select` is allowed to poll multiple futures, but
506+
// should not do any async work within the body of any chosen branch. A
507+
// previous iteration of this code polled the "collection" as "&mut
508+
// collection", and checked the status of the support bundle within a
509+
// branch of the "select" polling "yield_interval.tick()".
510+
//
511+
// We organize this work to "check for cancellation" as a whole future
512+
// for a critical, but subtle reason: After the tick timer yields,
513+
// we may then try to `await` a database function.
514+
//
515+
// This, at a surface-level glance seems innocent enough. However, there
516+
// is something potentially insidious here: if calling a datastore
517+
// function - such as "support_bundle_get" - awaits acquiring access
518+
// to a connection from the connection pool, while creating the
519+
// collection ALSO potentially awaits acquiring access to the
520+
// connection pool, it is possible for:
521+
//
522+
// 1. The `&mut collection` arm to have created a future, currently
523+
// yielded, which wants access to this underlying resource.
524+
// 2. The current operation executing in `support_bundle_get` to
525+
// be awaiting access to this same underlying resource.
526+
//
527+
// In this specific case, the connection pool would be attempting to
528+
// yield to the `&mut collection` arm, which cannot run, if we were
529+
// awaiting in the body of a different async select arm. This would
530+
// result in a deadlock.
531+
//
532+
// In the future, we may attempt to make access to the connection pool
533+
// safer from concurrent asynchronous access - it is unsettling that
534+
// multiple concurrent `.claim()` functions can cause this behavior -
535+
// but in the meantime, we perform this cancellation check in a single
536+
// future that always is polled concurrently with the collection work.
537+
// Because of this separation, each future is polled until one
538+
// completes, at which point we deterministically exit.
539+
//
540+
// For more details, see:
541+
// https://github.com/oxidecomputer/omicron/issues/9259
525542

526-
let bundle = self.datastore.support_bundle_get(
527-
&self.opctx,
528-
self.bundle.id.into()
529-
).await?;
530-
if !matches!(bundle.state, SupportBundleState::Collecting) {
531-
warn!(
532-
&self.log,
533-
"Support Bundle cancelled - stopping collection";
534-
"bundle" => %self.bundle.id,
535-
"state" => ?self.bundle.state
536-
);
537-
anyhow::bail!("Support Bundle Cancelled");
538-
}
539-
},
540-
// Otherwise, keep making progress on the collection itself.
541-
report = &mut collection => {
542-
info!(
543-
&self.log,
544-
"Bundle Collection completed";
545-
"bundle" => %self.bundle.id
546-
);
547-
return report;
548-
},
549-
}
543+
tokio::select! {
544+
// Returns if the bundle should no longer be collected.
545+
why = self.check_for_cancellation() => {
546+
warn!(
547+
&self.log,
548+
"Support Bundle cancelled - stopping collection";
549+
"bundle" => %self.bundle.id,
550+
"state" => ?self.bundle.state
551+
);
552+
return Err(why);
553+
},
554+
// Otherwise, keep making progress on the collection itself.
555+
report = self.collect_bundle_as_file(&dir) => {
556+
info!(
557+
&self.log,
558+
"Bundle Collection completed";
559+
"bundle" => %self.bundle.id
560+
);
561+
return report;
562+
},
550563
}
551564
}
552565

@@ -656,6 +669,65 @@ impl BundleCollection {
656669
Ok(())
657670
}
658671

672+
// Indefinitely perform periodic checks about whether or not we should
673+
// cancel the bundle.
674+
//
675+
// Returns an error if:
676+
// - The bundle state is no longer SupportBundleState::Collecting
677+
// (which happens if the bundle has been explicitly cancelled, or
678+
// if the backing storage has been expunged).
679+
// - The bundle has been deleted
680+
//
681+
// Otherwise, keeps checking indefinitely while polled.
682+
async fn check_for_cancellation(&self) -> anyhow::Error {
683+
let work_duration = tokio::time::Duration::from_secs(5);
684+
let mut yield_interval = tokio::time::interval_at(
685+
tokio::time::Instant::now() + work_duration,
686+
work_duration,
687+
);
688+
689+
loop {
690+
// Timer fired mid-collection - check if we should stop.
691+
yield_interval.tick().await;
692+
trace!(
693+
self.log,
694+
"Checking if Bundle Collection cancelled";
695+
"bundle" => %self.bundle.id
696+
);
697+
698+
match self
699+
.datastore
700+
.support_bundle_get(&self.opctx, self.bundle.id.into())
701+
.await
702+
{
703+
Ok(SupportBundle {
704+
state: SupportBundleState::Collecting,
705+
..
706+
}) => {
707+
// Bundle still collecting; continue...
708+
continue;
709+
}
710+
Ok(_) => {
711+
// Not collecting, for any reason: Time to exit
712+
return anyhow::anyhow!("Support Bundle Cancelled");
713+
}
714+
Err(Error::ObjectNotFound { .. } | Error::NotFound { .. }) => {
715+
return anyhow::anyhow!("Support Bundle Deleted");
716+
}
717+
Err(err) => {
718+
warn!(
719+
self.log,
720+
"Database error checking bundle cancellation";
721+
InlineErrorChain::new(&err)
722+
);
723+
724+
// If we cannot contact the database, retry later
725+
continue;
726+
}
727+
}
728+
}
729+
}
730+
659731
// Perform the work of collecting the support bundle into a temporary directory
660732
//
661733
// - "dir" is a directory where data can be stored.

0 commit comments

Comments
 (0)