Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
ccac624
feat: do not swallow subnet assignment on upgrade loop errors
pierugo-dfinity Dec 1, 2025
710b4fc
docs: revert removed docs
pierugo-dfinity Dec 1, 2025
3801182
re-trigger CI
pierugo-dfinity Dec 1, 2025
14d9854
style
pierugo-dfinity Dec 2, 2025
e128f33
test: distinguish tests for recoverable vs unrecoverable CUP corruptions
pierugo-dfinity Dec 3, 2025
e58c655
style: exhaustive matches + readability
pierugo-dfinity Dec 3, 2025
bee2dea
fix: unrecoverable actually on failover nodes & recoverable as new va…
pierugo-dfinity Dec 4, 2025
5dc81bb
fix: typo
pierugo-dfinity Dec 4, 2025
96f34ae
fix: typo
pierugo-dfinity Dec 4, 2025
1a3a2a9
Merge branch 'master' into pierugo/recovery-tests/soft-vs-hard-cup-co…
pierugo-dfinity Dec 4, 2025
6effe7b
fix merge: corrupt CUP on chain keys
pierugo-dfinity Dec 4, 2025
ad468ea
fix: remove TODO
pierugo-dfinity Dec 4, 2025
3276563
fix: typo
pierugo-dfinity Dec 4, 2025
32bc581
fix: check that unassigned nodes don't delete state & CUP in unrecove…
pierugo-dfinity Dec 5, 2025
d71f87c
style
pierugo-dfinity Dec 5, 2025
d26b9a7
fix: typo
pierugo-dfinity Dec 5, 2025
2c6dd52
Merge branch 'pierugo/orchestrator/return-subnet-id' into pierugo/rec…
pierugo-dfinity Dec 5, 2025
f67f8ef
Merge branch 'master' into pierugo/recovery-tests/soft-vs-hard-cup-co…
pierugo-dfinity Dec 5, 2025
aba15f2
feat: unrecoverable case without chain keys
pierugo-dfinity Dec 5, 2025
d38ccce
docs
pierugo-dfinity Dec 5, 2025
4a50cb0
Revert "Merge branch 'pierugo/orchestrator/return-subnet-id' into pie…
pierugo-dfinity Dec 8, 2025
6d2d15f
Reapply "Merge branch 'pierugo/orchestrator/return-subnet-id' into pi…
pierugo-dfinity Dec 8, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion rs/orchestrator/image_upgrader/src/image_upgrader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -308,5 +308,5 @@ pub trait ImageUpgrader<V: Clone + Debug + PartialEq + Eq + Send + Sync>: Send +
/// * Check if an image upgrade is scheduled.
/// * Optionally prepare the upgrade in advance using `prepare_upgrade`.
/// * Once it is time to upgrade, execute it using `execute_upgrade`
async fn check_for_upgrade(&mut self) -> UpgradeResult<Self::UpgradeType>;
async fn check_for_upgrade(&mut self) -> Self::UpgradeType;
}
52 changes: 24 additions & 28 deletions rs/orchestrator/src/orchestrator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ use crate::{
registration::NodeRegistration,
registry_helper::RegistryHelper,
ssh_access_manager::SshAccessManager,
upgrade::{OrchestratorControlFlow, Upgrade},
upgrade::{Upgrade, UpgradeCheckResult},
};
use backoff::ExponentialBackoffBuilder;
use get_if_addrs::get_if_addrs;
Expand Down Expand Up @@ -390,35 +390,35 @@ impl Orchestrator {
// in case it gets stuck in an unexpected situation for longer than 15 minutes.
const UPGRADE_TIMEOUT: Duration = Duration::from_secs(60 * 15);

// Since the orchestrator is just starting, the last flow must have been a `Stop`
let mut last_flow = OrchestratorControlFlow::Stop;
// Since the orchestrator is just starting, the last upgrade check must have been a `Stop`
let mut last_upgrade_result = UpgradeCheckResult::Stop;

loop {
match tokio::time::timeout(UPGRADE_TIMEOUT, upgrade.check_for_upgrade()).await {
Ok(Ok(control_flow)) => {
Ok(upgrade_result) => {
// Update the subnet assignment based on the latest upgrade result.
*subnet_assignment.write().unwrap() = upgrade_result.as_subnet_assignment();

if let Err(err) = upgrade_result.as_result() {
warn!(log, "Check for upgrade failed: {err}");
upgrade.metrics.failed_consecutive_upgrade_checks.inc();
continue;
}

// Starting from here, the upgrade check was successful.
upgrade.metrics.failed_consecutive_upgrade_checks.reset();

match control_flow {
OrchestratorControlFlow::Assigned(subnet_id)
| OrchestratorControlFlow::Leaving(subnet_id) => {
*subnet_assignment.write().unwrap() =
SubnetAssignment::Assigned(subnet_id);
}
OrchestratorControlFlow::Unassigned => {
*subnet_assignment.write().unwrap() = SubnetAssignment::Unassigned;
}
OrchestratorControlFlow::Stop => {
// Wake up all orchestrator tasks and instruct them to stop.
cancellation_token.cancel();
break;
}
if matches!(upgrade_result, UpgradeCheckResult::Stop) {
// Wake up all orchestrator tasks and instruct them to stop.
cancellation_token.cancel();
break;
}

let node_id = upgrade.node_id();
match (&last_flow, &control_flow) {
match (&last_upgrade_result, &upgrade_result) {
(
OrchestratorControlFlow::Assigned(subnet_id),
OrchestratorControlFlow::Leaving(_),
UpgradeCheckResult::Assigned(subnet_id),
UpgradeCheckResult::Leaving(_),
) => {
UtilityCommand::notify_host(
&format!(
Expand All @@ -431,8 +431,8 @@ impl Orchestrator {
);
}
(
OrchestratorControlFlow::Leaving(subnet_id),
OrchestratorControlFlow::Unassigned,
UpgradeCheckResult::Leaving(subnet_id),
UpgradeCheckResult::Unassigned,
) => {
UtilityCommand::notify_host(
&format!(
Expand All @@ -444,11 +444,7 @@ impl Orchestrator {
// Other transitions are not important at the moment.
_ => {}
}
last_flow = control_flow;
}
Ok(Err(err)) => {
warn!(log, "Check for upgrade failed: {err}");
upgrade.metrics.failed_consecutive_upgrade_checks.inc();
last_upgrade_result = upgrade_result;
}
Err(err) => {
warn!(log, "Check for upgrade timed out: {err}");
Expand Down
Loading