diff --git a/Sources/Containerization/ContainerManager.swift b/Sources/Containerization/ContainerManager.swift index 8ac38413..e52a2db3 100644 --- a/Sources/Containerization/ContainerManager.swift +++ b/Sources/Containerization/ContainerManager.swift @@ -352,10 +352,12 @@ public struct ContainerManager: Sendable { /// - id: The container ID. /// - reference: The image reference. /// - rootfsSizeInBytes: The size of the root filesystem in bytes. Defaults to 8 GiB. + /// - readOnly: Whether to mount the root filesystem as read-only. public mutating func create( _ id: String, reference: String, rootfsSizeInBytes: UInt64 = 8.gib(), + readOnly: Bool = false, configuration: (inout LinuxContainer.Configuration) throws -> Void ) async throws -> LinuxContainer { let image = try await imageStore.get(reference: reference, pull: true) @@ -363,6 +365,7 @@ public struct ContainerManager: Sendable { id, image: image, rootfsSizeInBytes: rootfsSizeInBytes, + readOnly: readOnly, configuration: configuration ) } @@ -372,19 +375,24 @@ public struct ContainerManager: Sendable { /// - id: The container ID. /// - image: The image. /// - rootfsSizeInBytes: The size of the root filesystem in bytes. Defaults to 8 GiB. + /// - readOnly: Whether to mount the root filesystem as read-only. public mutating func create( _ id: String, image: Image, rootfsSizeInBytes: UInt64 = 8.gib(), + readOnly: Bool = false, configuration: (inout LinuxContainer.Configuration) throws -> Void ) async throws -> LinuxContainer { let path = try createContainerRoot(id) - let rootfs = try await unpack( + var rootfs = try await unpack( image: image, destination: path.appendingPathComponent("rootfs.ext4"), size: rootfsSizeInBytes ) + if readOnly { + rootfs.options.append("ro") + } return try await create( id, image: image, diff --git a/Sources/Containerization/LinuxContainer.swift b/Sources/Containerization/LinuxContainer.swift index dc502e2d..07c89852 100644 --- a/Sources/Containerization/LinuxContainer.swift +++ b/Sources/Containerization/LinuxContainer.swift @@ -235,25 +235,22 @@ public final class LinuxContainer: Container, Sendable { /// - vmm: The virtual machine manager that will handle launching the VM for the container. /// - logger: Optional logger for container operations. /// - configuration: A closure that configures the container by modifying the Configuration instance. - public init( + public convenience init( _ id: String, rootfs: Mount, vmm: VirtualMachineManager, logger: Logger? = nil, configuration: (inout Configuration) throws -> Void ) throws { - self.id = id - self.vmm = vmm - self.hostVsockPorts = Atomic(0x1000_0000) - self.guestVsockPorts = Atomic(0x1000_0000) - self.rootfs = rootfs - self.logger = logger - var config = Configuration() try configuration(&config) - - self.config = config - self.state = AsyncMutex(.initialized) + self.init( + id, + rootfs: rootfs, + vmm: vmm, + configuration: config, + logger: logger + ) } /// Create a new `LinuxContainer`. @@ -275,11 +272,10 @@ public final class LinuxContainer: Container, Sendable { self.vmm = vmm self.hostVsockPorts = Atomic(0x1000_0000) self.guestVsockPorts = Atomic(0x1000_0000) - self.rootfs = rootfs self.logger = logger - self.config = configuration self.state = AsyncMutex(.initialized) + self.rootfs = rootfs } private static func createDefaultRuntimeSpec(_ id: String) -> Spec { @@ -309,6 +305,10 @@ public final class LinuxContainer: Container, Sendable { // Linux toggles. spec.linux?.sysctl = config.sysctl + // If the rootfs was requested as read-only, set it in the OCI spec. + // We let the OCI runtime remount as ro, instead of doing it originally. + spec.root?.readonly = self.rootfs.options.contains("ro") + // Resource limits. // CPU: quota/period model where period is 100ms (100,000µs) and quota is cpus * period // Memory: limit in bytes @@ -394,11 +394,21 @@ extension LinuxContainer { try await self.state.withLock { state in try state.validateForCreate() + // This is a bit of an annoyance, but because the type we use for the rootfs is simply + // the same Mount type we use for non-rootfs mounts, it's possible someone passed 'ro' + // in the options (which should be perfectly valid). However, the problem is when we go to + // setup /etc/hosts and /etc/resolv.conf, as we'd get EROFS if they did supply 'ro'. + // To remedy this, remove any "ro" options before passing to VZ. Having the OCI runtime + // remount "ro" (which is what we do later in the guest) is truthfully the right thing, + // but this bit here is just a tad awkward. + var modifiedRootfs = self.rootfs + modifiedRootfs.options.removeAll(where: { $0 == "ro" }) + let vmConfig = VMConfiguration( cpus: self.cpus, memoryInBytes: self.memoryInBytes, interfaces: self.interfaces, - mountsByID: [self.id: [self.rootfs] + self.config.mounts], + mountsByID: [self.id: [modifiedRootfs] + self.config.mounts], bootLog: self.config.bootLog, nestedVirtualization: self.config.virtualization ) diff --git a/Sources/Containerization/LinuxPod.swift b/Sources/Containerization/LinuxPod.swift index 670f8f89..a3be9f19 100644 --- a/Sources/Containerization/LinuxPod.swift +++ b/Sources/Containerization/LinuxPod.swift @@ -195,7 +195,7 @@ public final class LinuxPod: Sendable { ) } - private func generateRuntimeSpec(containerID: String, config: ContainerConfiguration) -> Spec { + private func generateRuntimeSpec(containerID: String, config: ContainerConfiguration, rootfs: Mount) -> Spec { var spec = Self.createDefaultRuntimeSpec(containerID, podID: self.id) // Process configuration @@ -207,6 +207,10 @@ public final class LinuxPod: Sendable { // Linux toggles spec.linux?.sysctl = config.sysctl + // If the rootfs was requested as read-only, set it in the OCI spec. + // We let the OCI runtime remount as ro, instead of doing it originally. + spec.root?.readonly = rootfs.options.contains("ro") + // Resource limits (if specified) if let cpus = config.cpus, cpus > 0 { spec.linux?.resources?.cpu = LinuxCPU( @@ -287,9 +291,13 @@ extension LinuxPod { try state.phase.validateForCreate() // Build mountsByID for all containers. + // Strip "ro" from rootfs options - we handle readonly via the OCI spec's + // root.readonly field and remount in vmexec after setup is complete. var mountsByID: [String: [Mount]] = [:] for (id, container) in state.containers { - mountsByID[id] = [container.rootfs] + container.config.mounts + var modifiedRootfs = container.rootfs + modifiedRootfs.options.removeAll(where: { $0 == "ro" }) + mountsByID[id] = [modifiedRootfs] + container.config.mounts } let vmConfig = VMConfiguration( @@ -450,7 +458,7 @@ extension LinuxPod { let agent = try await createdState.vm.dialAgent() do { - var spec = self.generateRuntimeSpec(containerID: containerID, config: container.config) + var spec = self.generateRuntimeSpec(containerID: containerID, config: container.config, rootfs: container.rootfs) // We don't need the rootfs, nor do OCI runtimes want it included. let containerMounts = createdState.vm.mounts[containerID] ?? [] spec.mounts = containerMounts.dropFirst().map { $0.to } @@ -685,7 +693,7 @@ extension LinuxPod { ) } - var spec = self.generateRuntimeSpec(containerID: containerID, config: container.config) + var spec = self.generateRuntimeSpec(containerID: containerID, config: container.config, rootfs: container.rootfs) var config = LinuxProcessConfiguration() try configuration(&config) spec.process = config.toOCI() diff --git a/Sources/Integration/ContainerTests.swift b/Sources/Integration/ContainerTests.swift index e5b0cad0..2ed462a8 100644 --- a/Sources/Integration/ContainerTests.swift +++ b/Sources/Integration/ContainerTests.swift @@ -1436,4 +1436,96 @@ extension IntegrationSuite { throw IntegrationError.assert(msg: "container with CAP_CHOWN should succeed, got exit code \(status.exitCode)") } } + + func testReadOnlyRootfs() async throws { + let id = "test-readonly-rootfs" + + let bs = try await bootstrap(id) + var rootfs = bs.rootfs + rootfs.options.append("ro") + let container = try LinuxContainer(id, rootfs: rootfs, vmm: bs.vmm) { config in + config.process.arguments = ["touch", "/testfile"] + config.bootLog = bs.bootLog + } + + try await container.create() + try await container.start() + + let status = try await container.wait() + try await container.stop() + + // touch should fail on a read-only rootfs + guard status.exitCode != 0 else { + throw IntegrationError.assert(msg: "touch should have failed on read-only rootfs") + } + } + + func testReadOnlyRootfsHostsFileWritten() async throws { + let id = "test-readonly-rootfs-hosts" + + let bs = try await bootstrap(id) + var rootfs = bs.rootfs + rootfs.options.append("ro") + let buffer = BufferWriter() + let entry = Hosts.Entry.localHostIPV4(comment: "ReadOnlyTest") + let container = try LinuxContainer(id, rootfs: rootfs, vmm: bs.vmm) { config in + // Verify /etc/hosts was written before rootfs was remounted read-only + config.process.arguments = ["cat", "/etc/hosts"] + config.process.stdout = buffer + config.hosts = Hosts(entries: [entry]) + config.bootLog = bs.bootLog + } + + try await container.create() + try await container.start() + + let status = try await container.wait() + try await container.stop() + + guard status.exitCode == 0 else { + throw IntegrationError.assert(msg: "cat /etc/hosts failed with status \(status)") + } + + guard let output = String(data: buffer.data, encoding: .utf8) else { + throw IntegrationError.assert(msg: "failed to convert stdout to UTF8") + } + + guard output.contains("ReadOnlyTest") else { + throw IntegrationError.assert(msg: "expected /etc/hosts to contain our entry, got: \(output)") + } + } + + func testReadOnlyRootfsDNSConfigured() async throws { + let id = "test-readonly-rootfs-dns" + + let bs = try await bootstrap(id) + var rootfs = bs.rootfs + rootfs.options.append("ro") + let buffer = BufferWriter() + let container = try LinuxContainer(id, rootfs: rootfs, vmm: bs.vmm) { config in + // Verify /etc/resolv.conf was written before rootfs was remounted read-only + config.process.arguments = ["cat", "/etc/resolv.conf"] + config.process.stdout = buffer + config.dns = DNS(nameservers: ["8.8.8.8", "8.8.4.4"]) + config.bootLog = bs.bootLog + } + + try await container.create() + try await container.start() + + let status = try await container.wait() + try await container.stop() + + guard status.exitCode == 0 else { + throw IntegrationError.assert(msg: "cat /etc/resolv.conf failed with status \(status)") + } + + guard let output = String(data: buffer.data, encoding: .utf8) else { + throw IntegrationError.assert(msg: "failed to convert stdout to UTF8") + } + + guard output.contains("8.8.8.8") && output.contains("8.8.4.4") else { + throw IntegrationError.assert(msg: "expected /etc/resolv.conf to contain DNS servers, got: \(output)") + } + } } diff --git a/Sources/Integration/PodTests.swift b/Sources/Integration/PodTests.swift index 79708bd7..72ffeec2 100644 --- a/Sources/Integration/PodTests.swift +++ b/Sources/Integration/PodTests.swift @@ -746,4 +746,71 @@ extension IntegrationSuite { throw IntegrationError.assert(msg: "ps output should contain 'sleep 300', got: '\(output)'") } } + + func testPodReadOnlyRootfs() async throws { + let id = "test-pod-readonly-rootfs" + + let bs = try await bootstrap(id) + var rootfs = bs.rootfs + rootfs.options.append("ro") + let pod = try LinuxPod(id, vmm: bs.vmm) { config in + config.cpus = 4 + config.memoryInBytes = 1024.mib() + config.bootLog = bs.bootLog + } + + try await pod.addContainer("container1", rootfs: rootfs) { config in + config.process.arguments = ["touch", "/testfile"] + } + + try await pod.create() + try await pod.startContainer("container1") + + let status = try await pod.waitContainer("container1") + try await pod.stop() + + // touch should fail on a read-only rootfs + guard status.exitCode != 0 else { + throw IntegrationError.assert(msg: "touch should have failed on read-only rootfs") + } + } + + func testPodReadOnlyRootfsDNSConfigured() async throws { + let id = "test-pod-readonly-rootfs-dns" + + let bs = try await bootstrap(id) + var rootfs = bs.rootfs + rootfs.options.append("ro") + let pod = try LinuxPod(id, vmm: bs.vmm) { config in + config.cpus = 4 + config.memoryInBytes = 1024.mib() + config.bootLog = bs.bootLog + config.dns = DNS(nameservers: ["8.8.8.8", "8.8.4.4"]) + } + + let buffer = BufferWriter() + try await pod.addContainer("container1", rootfs: rootfs) { config in + // Verify /etc/resolv.conf was written before rootfs was remounted read-only + config.process.arguments = ["cat", "/etc/resolv.conf"] + config.process.stdout = buffer + } + + try await pod.create() + try await pod.startContainer("container1") + + let status = try await pod.waitContainer("container1") + try await pod.stop() + + guard status.exitCode == 0 else { + throw IntegrationError.assert(msg: "cat /etc/resolv.conf failed with status \(status)") + } + + guard let output = String(data: buffer.data, encoding: .utf8) else { + throw IntegrationError.assert(msg: "failed to convert stdout to UTF8") + } + + guard output.contains("8.8.8.8") && output.contains("8.8.4.4") else { + throw IntegrationError.assert(msg: "expected /etc/resolv.conf to contain DNS servers, got: \(output)") + } + } } diff --git a/Sources/Integration/Suite.swift b/Sources/Integration/Suite.swift index 3d2747b3..e6e4a8ab 100644 --- a/Sources/Integration/Suite.swift +++ b/Sources/Integration/Suite.swift @@ -305,6 +305,9 @@ struct IntegrationSuite: AsyncParsableCommand { Test("container capabilities OCI default", testCapabilitiesOCIDefault), Test("container capabilities all capabilities", testCapabilitiesAllCapabilities), Test("container capabilities file ownership", testCapabilitiesFileOwnership), + Test("container read-only rootfs", testReadOnlyRootfs), + Test("container read-only rootfs hosts file", testReadOnlyRootfsHostsFileWritten), + Test("container read-only rootfs DNS", testReadOnlyRootfsDNSConfigured), // Pods Test("pod single container", testPodSingleContainer), @@ -321,6 +324,8 @@ struct IntegrationSuite: AsyncParsableCommand { Test("pod container PID namespace isolation", testPodContainerPIDNamespaceIsolation), Test("pod container independent resource limits", testPodContainerIndependentResourceLimits), Test("pod shared PID namespace", testPodSharedPIDNamespace), + Test("pod read-only rootfs", testPodReadOnlyRootfs), + Test("pod read-only rootfs DNS", testPodReadOnlyRootfsDNSConfigured), ] let passed: Atomic = Atomic(0) diff --git a/Sources/cctl/RunCommand.swift b/Sources/cctl/RunCommand.swift index 47923816..82b2aee3 100644 --- a/Sources/cctl/RunCommand.swift +++ b/Sources/cctl/RunCommand.swift @@ -62,6 +62,9 @@ extension Application { @Option(name: .long, help: "Path to OCI runtime to use for spawning the container") var ociRuntimePath: String? + @Flag(name: .long, help: "Make rootfs readonly") + var readOnly: Bool = false + @Option( name: [.customLong("kernel"), .customShort("k")], help: "Kernel binary path", completion: .file(), transform: { str in @@ -94,7 +97,8 @@ extension Application { let container = try await manager.create( id, reference: imageReference, - rootfsSizeInBytes: fsSizeInMB.mib() + rootfsSizeInBytes: fsSizeInMB.mib(), + readOnly: readOnly ) { config in config.cpus = cpus config.memoryInBytes = memory.mib() diff --git a/vminitd/Sources/vmexec/RunCommand.swift b/vminitd/Sources/vmexec/RunCommand.swift index 779f1c5b..3ab5e0bc 100644 --- a/vminitd/Sources/vmexec/RunCommand.swift +++ b/vminitd/Sources/vmexec/RunCommand.swift @@ -58,9 +58,34 @@ struct RunCommand: ParsableCommand { try setDevSymlinks(rootfs: rootfs.path) try pivotRoot(rootfs: rootfs.path) + + // Remount ro if requested. + if rootfs.readonly { + try self.remountRootfsReadOnly() + } + try reOpenDevNull() } + private func remountRootfsReadOnly() throws { + var flags = UInt(MS_BIND | MS_REMOUNT | MS_RDONLY) + + let ret = mount("", "/", "", flags, "") + if ret == 0 { + return + } + + var s = statfs() + guard statfs("/", &s) == 0 else { + throw App.Errno(stage: "statfs(/)") + } + flags |= s.f_flags + + guard mount("", "/", "", flags, "") == 0 else { + throw App.Errno(stage: "mount rootfs ro") + } + } + private func childSetup( spec: ContainerizationOCI.Spec, ackPipe: FileHandle,