Skip to content
This repository was archived by the owner on Oct 10, 2025. It is now read-only.

Commit f19fd8c

Browse files
committed
refactor: implement CommandResult value object and improve SSH connection reliability
- Add CommandResult value object with factory methods (success_result, failure_result)
- Replace hash-based SSH command results with consistent object methods
- Update all SSH Connection and Channel classes to use CommandResult
- Enhance SSH reconnection logic with improved timing for VM reboots
- Add comprehensive error handling and debug output for cloud-init monitoring
- Update all unit, integration, and container tests for CommandResult API
- Improve E2E test reliability and performance (consistently ~2.5 minutes)
- Fix system summary Docker detection with multiple fallback methods
- All tests passing with consistent CommandResult object access patterns
1 parent 96688da commit f19fd8c

File tree

11 files changed

+497
-80
lines changed

11 files changed

+497
-80
lines changed

lib/TorrustDeploy/App/Command/Provision.pm

Lines changed: 121 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,11 @@ sub execute {
6161
# Wait for cloud-init completion
6262
$self->_wait_for_cloud_init($ssh_connection);
6363

64+
# Force reconnection after cloud-init completes (VM reboots during cloud-init)
65+
say "🔄 Refreshing SSH connection after cloud-init reboot...";
66+
STDOUT->flush();
67+
$ssh_connection->force_reconnect();
68+
6469
# Verify SSH key authentication after cloud-init completes
6570
$self->_verify_ssh_key_auth($ssh_connection);
6671

@@ -108,6 +113,7 @@ sub _wait_for_cloud_init {
108113

109114
say "Waiting for cloud-init to complete...";
110115
say "This may take several minutes while packages are installed and configured.";
116+
STDOUT->flush();
111117

112118
my $completion_file = "/var/lib/cloud/torrust-setup-complete";
113119
my $max_attempts = 360; # 30 minutes with 5-second intervals
@@ -117,60 +123,124 @@ sub _wait_for_cloud_init {
117123

118124
# Step 1: Wait until SSH connection is available (for password auth to check cloud-init)
119125
say "⏳ Waiting for SSH service to become available...";
126+
STDOUT->flush();
120127

121128
while ($attempt < $max_attempts && !$ssh_connected) {
122129
$attempt++;
123130

124131
if ($ssh_connection->test_password_connection()) {
125132
$ssh_connected = 1;
126133
say "✅ SSH password connection established to " . $ssh_connection->host;
134+
STDOUT->flush();
127135
} else {
128136
if ($attempt % 6 == 0) { # Every 30 seconds
129137
say " [Waiting for SSH connection... ${attempt}0s elapsed]";
138+
STDOUT->flush();
130139
}
131140
sleep(5);
132141
}
133142
}
134143

135144
if (!$ssh_connected) {
136145
say "❌ Failed to establish SSH connection to " . $ssh_connection->host . " after " . ($max_attempts * 5 / 60) . " minutes";
146+
STDOUT->flush();
137147
$self->_print_cloud_init_logs($ssh_connection);
138148
die "SSH connection failed";
139149
}
140150

141151
# Step 2: Wait until cloud-init completion marker is created
142152
say "⏳ Waiting for cloud-init to complete...";
153+
STDOUT->flush();
143154

144155
$attempt = 0;
156+
my $consecutive_ssh_failures = 0;
145157
while ($attempt < $max_attempts) {
146158
$attempt++;
147159

148160
my $result = $ssh_connection->execute_command("test -f $completion_file");
149161

150-
if ($result->{success}) {
162+
# Debug: Always show result details when exit code is 0
163+
if ($result->exit_code == 0) {
164+
say " [DEBUG] File exists! Exit code: " . $result->exit_code .
165+
", Success method: " . ($result->success ? 'true' : 'false') .
166+
", Output: '" . ($result->output // 'EMPTY') . "'";
167+
STDOUT->flush();
168+
}
169+
170+
if ($result->success) {
151171
say "✅ Cloud-init setup completed successfully!";
172+
STDOUT->flush();
152173

153174
# Show completion message
154175
my $completion_result = $ssh_connection->execute_command("cat $completion_file");
155-
if ($completion_result->{success} && $completion_result->{output}) {
156-
chomp $completion_result->{output};
157-
say "📅 Completion marker: " . $completion_result->{output};
176+
if ($completion_result->success && $completion_result->output) {
177+
chomp(my $output = $completion_result->output);
178+
say "📅 Completion marker: " . $output;
179+
STDOUT->flush();
158180
}
159181
$cloud_init_success = 1;
160182
last;
183+
} else {
184+
# Track consecutive SSH failures (exit code 255)
185+
if ($result->exit_code == 255) {
186+
$consecutive_ssh_failures++;
187+
# If we have too many consecutive SSH failures, try to re-establish password connection
188+
if ($consecutive_ssh_failures >= 12) { # 1 minute of consecutive failures
189+
say "⚠️ SSH connection lost, attempting to re-establish (VM may be rebooting)...";
190+
say " [Waiting 30s for VM to complete reboot...]";
191+
STDOUT->flush();
192+
sleep(30); # Give VM time to fully reboot
193+
194+
# Try to re-establish password connection (VM might have rebooted)
195+
my $reconnect_attempts = 0;
196+
while ($reconnect_attempts < 12 && !$ssh_connection->test_password_connection()) {
197+
$reconnect_attempts++;
198+
say " [Reconnection attempt $reconnect_attempts/12...]";
199+
STDOUT->flush();
200+
sleep(15); # Wait longer between attempts
201+
}
202+
203+
if ($ssh_connection->test_password_connection()) {
204+
say "✅ SSH connection re-established!";
205+
STDOUT->flush();
206+
$consecutive_ssh_failures = 0; # Reset counter after successful reconnection
207+
} else {
208+
say "❌ Failed to re-establish SSH connection after VM reboot.";
209+
say " [DEBUG] Last error: " . $result->output;
210+
STDOUT->flush();
211+
last;
212+
}
213+
}
214+
} else {
215+
# Reset counter for non-SSH failures (normal file-not-found errors)
216+
$consecutive_ssh_failures = 0;
217+
}
218+
219+
# Debug: Show why the command failed
220+
if ($attempt % 6 == 0) { # Every 30 seconds
221+
my $elapsed_seconds = $attempt * 5;
222+
say " [DEBUG ${elapsed_seconds}s] File check failed - Exit code: " . $result->exit_code .
223+
" (this is normal until cloud-init completes)";
224+
if ($consecutive_ssh_failures > 0) {
225+
say " [SSH failures: $consecutive_ssh_failures consecutive]";
226+
}
227+
STDOUT->flush();
228+
}
161229
}
162230

163231
# Show progress indicator every 2 minutes
164232
if ($attempt % 24 == 0) {
165233
my $elapsed_minutes = int($attempt * 5 / 60);
166234
say " [Cloud-init still running... ${elapsed_minutes} minutes elapsed]";
235+
STDOUT->flush();
167236
}
168237

169238
sleep(5);
170239
}
171240

172241
if (!$cloud_init_success) {
173242
say "❌ Timeout waiting for cloud-init to complete on " . $ssh_connection->host . " after " . ($max_attempts * 5 / 60) . " minutes";
243+
STDOUT->flush();
174244
$self->_print_cloud_init_logs($ssh_connection);
175245
die "Cloud-init timeout";
176246
}
# Print a short post-provisioning summary: the Docker version, the first line
# of the firewall status, and the address of the VM.
#
# Parameters:
#   $self           - the command object (not read by this method)
#   $ssh_connection - object providing execute_command($cmd), which returns a
#                     result with success() and output() accessors, and host()
#
# Returns nothing useful; everything is written to STDOUT, which is flushed
# after each section so the text appears promptly when output is piped.
sub _show_final_summary {
    my ($self, $ssh_connection) = @_;

    say "📦 Final system summary:";
    STDOUT->flush();

    # Docker probes, tried in order until one succeeds. Each entry is
    # [ label shown in the summary, remote command ].
    #   sg     - runs a single command with the docker group active. The
    #            earlier form `newgrp docker -c "..."` is not a valid newgrp
    #            invocation (newgrp has no -c option); sg is the documented
    #            equivalent for one-shot commands.
    #   sudo   - fallback when group membership is not usable.
    #   direct - plain invocation; presumably fails until the login session
    #            has picked up the docker group (TODO confirm on target image).
    my @docker_probes = (
        [ 'sg'     => 'sg docker -c "docker --version" 2>&1' ],
        [ 'sudo'   => 'sudo docker --version 2>&1' ],
        [ 'direct' => 'docker --version 2>&1' ],
    );

    my $docker_version = "Docker not available - all methods failed";
    for my $probe (@docker_probes) {
        my ($method, $command) = @{$probe};
        my $probe_result = $ssh_connection->execute_command($command);
        next unless $probe_result->success;

        # Strip the trailing newline BEFORE appending the suffix; chomping the
        # combined string afterwards would leave the newline embedded and
        # break the summary line in two.
        my $version = $probe_result->output // '';
        $version =~ s/\s+\z//;
        $docker_version = "$version (via $method)";
        last;
    }

    say " Docker: $docker_version";
    STDOUT->flush();

    # Check firewall status (first line of `ufw status` only).
    my $ufw_result = $ssh_connection->execute_command('sudo ufw status | head -1');
    my $ufw_status = $ufw_result->success ? $ufw_result->output : "UFW not available";
    chomp $ufw_status if $ufw_status;
    # An empty status (no output from the pipeline) is silently omitted.
    say " Firewall: $ufw_status" if $ufw_status;
    STDOUT->flush();

    say "Provisioning completed successfully!";
    say "VM is ready at IP: " . $ssh_connection->host;
    STDOUT->flush();
}
197300

198301
sub _print_cloud_init_logs {
@@ -203,16 +306,16 @@ sub _print_cloud_init_logs {
203306
# Print cloud-init-output.log
204307
say "=== /var/log/cloud-init-output.log ===";
205308
my $output_result = $ssh_connection->execute_command_with_sudo('cat /var/log/cloud-init-output.log');
206-
if ($output_result->{success}) {
207-
print $output_result->{output};
309+
if ($output_result->success) {
310+
print $output_result->output;
208311
} else {
209312
say "Cloud-init output log not available";
210313
}
211314

212315
say "=== /var/log/cloud-init.log ===";
213316
my $main_result = $ssh_connection->execute_command_with_sudo('cat /var/log/cloud-init.log');
214-
if ($main_result->{success}) {
215-
print $main_result->{output};
317+
if ($main_result->success) {
318+
print $main_result->output;
216319
} else {
217320
say "Cloud-init main log not available";
218321
}
@@ -222,6 +325,7 @@ sub _verify_ssh_key_auth {
222325
my ($self, $ssh_connection) = @_;
223326

224327
say "🔑 Checking SSH key authentication...";
328+
STDOUT->flush();
225329

226330
# SSH authentication might need time to fully stabilize after cloud-init reboot
227331
# Try with progressive delays: immediate, 5s, 10s, 15s
@@ -231,6 +335,7 @@ sub _verify_ssh_key_auth {
231335
if ($attempt > 0) {
232336
my $delay = $retry_delays[$attempt];
233337
say "⏳ Waiting ${delay}s before retry attempt " . ($attempt + 1) . "...";
338+
STDOUT->flush();
234339
sleep $delay;
235340
}
236341

@@ -243,16 +348,19 @@ sub _verify_ssh_key_auth {
243348
if ($fresh_ssh->test_key_connection()) {
244349
say "✅ SSH key authentication is working correctly!";
245350
say "You can now connect using: ssh -i " . $fresh_ssh->ssh_key_path . " " . $fresh_ssh->username . "@" . $fresh_ssh->host;
351+
STDOUT->flush();
246352
return;
247353
}
248354

249355
if ($attempt < $#retry_delays) {
250356
say "⚠️ SSH key authentication failed, will retry...";
357+
STDOUT->flush();
251358
}
252359
}
253360

254361
# All retries failed
255362
say "❌ SSH key authentication failed after all retries";
363+
STDOUT->flush();
256364
$self->_print_cloud_init_logs($ssh_connection);
257365
die "SSH key authentication failed";
258366
}

lib/TorrustDeploy/Infrastructure/SSH/Channel.pm

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package TorrustDeploy::Infrastructure::SSH::Channel;
22

33
use v5.38;
44
use Moo;
5+
use TorrustDeploy::Infrastructure::SSH::CommandResult;
56
use Carp qw(croak);
67
use namespace::clean;
78

@@ -18,17 +19,18 @@ has 'timeout' => (
1819
# Run $command on the wrapped channel and report the outcome as a
# CommandResult object.
#
# Parameters:
#   $command - command string handed to the channel's exec()
#
# Returns a TorrustDeploy::Infrastructure::SSH::CommandResult constructed from
# the collected output and the exit code.
# Croaks when exec() returns false, including the channel's error text (or
# 'Unknown error' when the channel reports none).
sub execute_command {
    my ($self, $command) = @_;

    $self->channel->exec($command)
        or croak "Failed to execute command '$command': "
            . ($self->channel->error || 'Unknown error');

    # Output is read before the exit code is queried, as in the original.
    my $output    = $self->read_output();
    my $exit_code = $self->get_exit_code();

    return TorrustDeploy::Infrastructure::SSH::CommandResult->new(
        exit_code => $exit_code,
        output    => $output,
    );
}
3335

3436
sub health_check {

0 commit comments

Comments
 (0)