diff --git a/api/src/syscall/mod.rs b/api/src/syscall/mod.rs index a0e0e251..da1ff60a 100644 --- a/api/src/syscall/mod.rs +++ b/api/src/syscall/mod.rs @@ -428,6 +428,11 @@ pub fn handle_syscall(uctx: &mut UserContext) { uctx.arg3(), uctx.arg4(), ), + Sysno::clone3 => sys_clone3( + uctx, + uctx.arg0() as _, // args_ptr + uctx.arg1() as _, // args_size + ), #[cfg(target_arch = "x86_64")] Sysno::fork => sys_fork(uctx), Sysno::exit => sys_exit(uctx.arg0() as _), diff --git a/api/src/syscall/task/clone.rs b/api/src/syscall/task/clone.rs index 06b50e2c..ee34ff7f 100644 --- a/api/src/syscall/task/clone.rs +++ b/api/src/syscall/task/clone.rs @@ -21,114 +21,187 @@ use crate::{ }; bitflags! { - /// Options for use with [`sys_clone`]. + /// Options for use with [`sys_clone`] and [`sys_clone3`]. #[derive(Debug, Clone, Copy, Default)] - struct CloneFlags: u32 { - /// The calling process and the child process run in the same - /// memory space. - const VM = CLONE_VM; - /// The caller and the child process share the same filesystem - /// information. - const FS = CLONE_FS; - /// The calling process and the child process share the same file - /// descriptor table. - const FILES = CLONE_FILES; - /// The calling process and the child process share the same table - /// of signal handlers. - const SIGHAND = CLONE_SIGHAND; + pub struct CloneFlags: u64 { + /// The calling process and the child process run in the same memory space. + const VM = CLONE_VM as u64; + /// The caller and the child process share the same filesystem information. + const FS = CLONE_FS as u64; + /// The calling process and the child process share the same file descriptor table. + const FILES = CLONE_FILES as u64; + /// The calling process and the child process share the same table of signal handlers. + const SIGHAND = CLONE_SIGHAND as u64; /// Sets pidfd to the child process's PID file descriptor. - const PIDFD = CLONE_PIDFD; - /// If the calling process is being traced, then trace the child - /// also. 
- const PTRACE = CLONE_PTRACE; - /// The execution of the calling process is suspended until the - /// child releases its virtual memory resources via a call to - /// execve(2) or _exit(2) (as with vfork(2)). - const VFORK = CLONE_VFORK; - /// The parent of the new child (as returned by getppid(2)) - /// will be the same as that of the calling process. - const PARENT = CLONE_PARENT; - /// The child is placed in the same thread group as the calling - /// process. - const THREAD = CLONE_THREAD; + const PIDFD = CLONE_PIDFD as u64; + /// If the calling process is being traced, then trace the child also. + const PTRACE = CLONE_PTRACE as u64; + /// The execution of the calling process is suspended until the child releases + /// its virtual memory resources via a call to execve(2) or _exit(2) (as with vfork(2)). + const VFORK = CLONE_VFORK as u64; + /// The parent of the new child (as returned by getppid(2)) will be the same + /// as that of the calling process. + const PARENT = CLONE_PARENT as u64; + /// The child is placed in the same thread group as the calling process. + const THREAD = CLONE_THREAD as u64; /// The cloned child is started in a new mount namespace. - const NEWNS = CLONE_NEWNS; - /// The child and the calling process share a single list of System - /// V semaphore adjustment values - const SYSVSEM = CLONE_SYSVSEM; + const NEWNS = CLONE_NEWNS as u64; + /// The child and the calling process share a single list of System V + /// semaphore adjustment values. + const SYSVSEM = CLONE_SYSVSEM as u64; /// The TLS (Thread Local Storage) descriptor is set to tls. - const SETTLS = CLONE_SETTLS; + const SETTLS = CLONE_SETTLS as u64; /// Store the child thread ID in the parent's memory. - const PARENT_SETTID = CLONE_PARENT_SETTID; - /// Clear (zero) the child thread ID in child memory when the child - /// exits, and do a wakeup on the futex at that address. 
- const CHILD_CLEARTID = CLONE_CHILD_CLEARTID; - /// A tracing process cannot force `CLONE_PTRACE` on this child - /// process. - const UNTRACED = CLONE_UNTRACED; + const PARENT_SETTID = CLONE_PARENT_SETTID as u64; + /// Clear (zero) the child thread ID in child memory when the child exits, + /// and do a wakeup on the futex at that address. + const CHILD_CLEARTID = CLONE_CHILD_CLEARTID as u64; + /// A tracing process cannot force `CLONE_PTRACE` on this child process. + const UNTRACED = CLONE_UNTRACED as u64; /// Store the child thread ID in the child's memory. - const CHILD_SETTID = CLONE_CHILD_SETTID; + const CHILD_SETTID = CLONE_CHILD_SETTID as u64; /// Create the process in a new cgroup namespace. - const NEWCGROUP = CLONE_NEWCGROUP; + const NEWCGROUP = CLONE_NEWCGROUP as u64; /// Create the process in a new UTS namespace. - const NEWUTS = CLONE_NEWUTS; + const NEWUTS = CLONE_NEWUTS as u64; /// Create the process in a new IPC namespace. - const NEWIPC = CLONE_NEWIPC; + const NEWIPC = CLONE_NEWIPC as u64; /// Create the process in a new user namespace. - const NEWUSER = CLONE_NEWUSER; + const NEWUSER = CLONE_NEWUSER as u64; /// Create the process in a new PID namespace. - const NEWPID = CLONE_NEWPID; + const NEWPID = CLONE_NEWPID as u64; /// Create the process in a new network namespace. - const NEWNET = CLONE_NEWNET; + const NEWNET = CLONE_NEWNET as u64; /// The new process shares an I/O context with the calling process. - const IO = CLONE_IO; + const IO = CLONE_IO as u64; + /// Clear signal handlers on clone (since Linux 5.5). + const CLEAR_SIGHAND = 0x100000000u64; + /// Clone into specific cgroup (since Linux 5.7). 
+ const INTO_CGROUP = 0x200000000u64; } } -pub fn sys_clone( - uctx: &UserContext, - flags: u32, - stack: usize, - parent_tid: usize, - #[cfg(any(target_arch = "x86_64", target_arch = "loongarch64"))] child_tid: usize, - tls: usize, - #[cfg(not(any(target_arch = "x86_64", target_arch = "loongarch64")))] child_tid: usize, -) -> AxResult { - const FLAG_MASK: u32 = 0xff; - let exit_signal = flags & FLAG_MASK; - let mut flags = CloneFlags::from_bits_truncate(flags & !FLAG_MASK); - if flags.contains(CloneFlags::VFORK) { - debug!("sys_clone: CLONE_VFORK slow path"); - flags.remove(CloneFlags::VM); +/// Unified arguments for clone/clone3/fork/vfork. +/// +/// This structure is used internally to homogenize parameters from different +/// clone syscall variants (clone, clone3, fork, vfork). +#[derive(Debug, Clone, Copy, Default)] +pub struct CloneArgs { + pub flags: CloneFlags, + pub exit_signal: u64, + pub stack: usize, + pub tls: usize, + pub parent_tid: usize, + pub child_tid: usize, + pub pidfd: usize, +} + +impl CloneArgs { + /// Create CloneArgs from clone() syscall parameters. + /// + /// Note: In clone(), the parent_tid parameter serves dual purpose: + /// - If CLONE_PIDFD: receives the pidfd + /// - If CLONE_PARENT_SETTID: receives the child TID + /// These two flags are mutually exclusive. 
+ pub fn from_clone( + raw_flags: u32, + stack: usize, + parent_tid: usize, + child_tid: usize, + tls: usize, + ) -> AxResult { + const FLAG_MASK: u32 = 0xff; + let flags = CloneFlags::from_bits_truncate((raw_flags & !FLAG_MASK) as u64); + let exit_signal = (raw_flags & FLAG_MASK) as u64; + + if flags.contains(CloneFlags::PIDFD | CloneFlags::PARENT_SETTID) { + return Err(AxError::InvalidInput); + } + + Ok(Self { + flags, + exit_signal, + stack, + tls, + parent_tid, + child_tid, + pidfd: 0, + }) } +} - debug!( - "sys_clone <= flags: {flags:?}, exit_signal: {exit_signal}, stack: {stack:#x}, ptid: \ - {parent_tid:#x}, ctid: {child_tid:#x}, tls: {tls:#x}" - ); +fn validate_common(args: &CloneArgs) -> AxResult<()> { + let flags = args.flags; + let exit_signal = args.exit_signal; - if exit_signal != 0 && flags.contains(CloneFlags::THREAD | CloneFlags::PARENT) { + if exit_signal > 0 && flags.contains(CloneFlags::THREAD | CloneFlags::PARENT) { return Err(AxError::InvalidInput); } if flags.contains(CloneFlags::THREAD) && !flags.contains(CloneFlags::VM | CloneFlags::SIGHAND) { return Err(AxError::InvalidInput); } - if flags.contains(CloneFlags::PIDFD | CloneFlags::PARENT_SETTID) { + if flags.contains(CloneFlags::SIGHAND) && !flags.contains(CloneFlags::VM) { + return Err(AxError::InvalidInput); + } + if flags.contains(CloneFlags::VFORK) && flags.contains(CloneFlags::THREAD) { + return Err(AxError::InvalidInput); + } + if exit_signal >= 64 { return Err(AxError::InvalidInput); } - let exit_signal = Signo::from_repr(exit_signal as u8); + + let namespace_flags = CloneFlags::NEWNS + | CloneFlags::NEWIPC + | CloneFlags::NEWNET + | CloneFlags::NEWPID + | CloneFlags::NEWUSER + | CloneFlags::NEWUTS + | CloneFlags::NEWCGROUP; + + if flags.intersects(namespace_flags) { + warn!( + "sys_clone/sys_clone3: namespace flags detected ({:?}), stub support only", + flags & namespace_flags + ); + } + + Ok(()) +} + +/// Core implementation of clone/clone3/fork/vfork. 
+pub fn do_clone(uctx: &UserContext, args: CloneArgs) -> AxResult { + validate_common(&args)?; + + let mut flags = args.flags; + let exit_signal = args.exit_signal; + + if flags.contains(CloneFlags::VFORK) { + debug!("do_clone: CLONE_VFORK slow path"); + flags.remove(CloneFlags::VM); + } + + debug!( + "do_clone <= flags: {:?}, exit_signal: {}, stack: {:#x}, tls: {:#x}", + flags, exit_signal, args.stack, args.tls + ); + + let exit_signal = if exit_signal > 0 { + Signo::from_repr(exit_signal as u8) + } else { + None + }; let mut new_uctx = *uctx; - if stack != 0 { - new_uctx.set_sp(stack); + if args.stack != 0 { + new_uctx.set_sp(args.stack); } if flags.contains(CloneFlags::SETTLS) { - new_uctx.set_tls(tls); + new_uctx.set_tls(args.tls); } new_uctx.set_retval(0); let set_child_tid = if flags.contains(CloneFlags::CHILD_SETTID) { - child_tid + args.child_tid } else { 0 }; @@ -139,8 +212,8 @@ pub fn sys_clone( let mut new_task = new_user_task(&curr.name(), new_uctx, set_child_tid); let tid = new_task.id().as_u64() as Pid; - if flags.contains(CloneFlags::PARENT_SETTID) { - (parent_tid as *mut Pid).vm_write(tid).ok(); + if flags.contains(CloneFlags::PARENT_SETTID) && args.parent_tid != 0 { + (args.parent_tid as *mut Pid).vm_write(tid)?; } let new_proc_data = if flags.contains(CloneFlags::THREAD) { @@ -170,9 +243,12 @@ pub fn sys_clone( let signal_actions = if flags.contains(CloneFlags::SIGHAND) { old_proc_data.signal.actions.clone() + } else if flags.contains(CloneFlags::CLEAR_SIGHAND) { + Arc::new(SpinNoIrq::new(Default::default())) } else { Arc::new(SpinNoIrq::new(old_proc_data.signal.actions.lock().clone())) }; + let proc_data = ProcessData::new( proc, old_proc_data.exe_path.read().clone(), @@ -182,7 +258,6 @@ pub fn sys_clone( exit_signal, ); proc_data.set_umask(old_proc_data.umask()); - // Inherit heap pointers from parent to ensure child's heap state is consistent after fork proc_data.set_heap_top(old_proc_data.get_heap_top()); { @@ -213,12 +288,20 @@ pub fn 
sys_clone( if flags.contains(CloneFlags::PIDFD) { let pidfd = PidFd::new(&new_proc_data); - (parent_tid as *mut i32).vm_write(pidfd.add_to_fd_table(true)?)?; + let fd = pidfd.add_to_fd_table(true)?; + let target = if args.pidfd != 0 { + args.pidfd + } else { + args.parent_tid + }; + if target != 0 { + (target as *mut i32).vm_write(fd)?; + } } let thr = Thread::new(tid, new_proc_data); - if flags.contains(CloneFlags::CHILD_CLEARTID) { - thr.set_clear_child_tid(child_tid); + if flags.contains(CloneFlags::CHILD_CLEARTID) && args.child_tid != 0 { + thr.set_clear_child_tid(args.child_tid); } *new_task.task_ext_mut() = Some(unsafe { AxTaskExt::from_impl(thr) }); @@ -228,6 +311,19 @@ pub fn sys_clone( Ok(tid as _) } +pub fn sys_clone( + uctx: &UserContext, + flags: u32, + stack: usize, + parent_tid: usize, + #[cfg(any(target_arch = "x86_64", target_arch = "loongarch64"))] child_tid: usize, + tls: usize, + #[cfg(not(any(target_arch = "x86_64", target_arch = "loongarch64")))] child_tid: usize, +) -> AxResult { + let args = CloneArgs::from_clone(flags, stack, parent_tid, child_tid, tls)?; + do_clone(uctx, args) +} + #[cfg(target_arch = "x86_64")] pub fn sys_fork(uctx: &UserContext) -> AxResult { sys_clone(uctx, SIGCHLD, 0, 0, 0, 0) diff --git a/api/src/syscall/task/clone3.rs b/api/src/syscall/task/clone3.rs new file mode 100644 index 00000000..755d8a70 --- /dev/null +++ b/api/src/syscall/task/clone3.rs @@ -0,0 +1,78 @@ +use axerrno::{AxError, AxResult}; +use axhal::uspace::UserContext; +use starry_vm::VmPtr; + +use super::clone::{CloneArgs, CloneFlags, do_clone}; + +/// Structure passed to clone3() system call. 
+#[repr(C)]
+#[derive(Debug, Clone, Copy, Default)]
+pub struct Clone3Args {
+    pub flags: u64,
+    pub pidfd: u64,
+    pub child_tid: u64,
+    pub parent_tid: u64,
+    pub exit_signal: u64,
+    pub stack: u64,
+    pub stack_size: u64,
+    pub tls: u64,
+    pub set_tid: u64,
+    pub set_tid_size: u64,
+    pub cgroup: u64,
+}
+
+const MIN_CLONE_ARGS_SIZE: usize = core::mem::size_of::<u64>() * 8;
+
+impl Clone3Args {
+    fn into_clone_args(self) -> AxResult<CloneArgs> {
+        if self.set_tid != 0 || self.set_tid_size != 0 {
+            warn!("sys_clone3: set_tid/set_tid_size not supported, ignoring");
+        }
+        if self.cgroup != 0 {
+            warn!("sys_clone3: cgroup parameter not supported, ignoring");
+        }
+
+        let flags = CloneFlags::from_bits_truncate(self.flags);
+
+        // clone3() passes the lowest address of the stack region plus its size,
+        // while the child's initial sp must be the top of that region. Both values
+        // come from user space, so a wrapping sum is rejected instead of overflowing.
+        let stack = match self.stack {
+            0 => 0,
+            base => base
+                .checked_add(self.stack_size)
+                .ok_or(AxError::InvalidInput)? as usize,
+        };
+
+        Ok(CloneArgs {
+            flags,
+            exit_signal: self.exit_signal,
+            stack,
+            tls: self.tls as usize,
+            parent_tid: self.parent_tid as usize,
+            child_tid: self.child_tid as usize,
+            pidfd: self.pidfd as usize,
+        })
+    }
+}
+
+pub fn sys_clone3(uctx: &UserContext, args_ptr: usize, args_size: usize) -> AxResult {
+    debug!("sys_clone3 <= args_ptr: {args_ptr:#x}, args_size: {args_size}");
+
+    if args_size < MIN_CLONE_ARGS_SIZE {
+        warn!("sys_clone3: args_size {args_size} too small, minimum is {MIN_CLONE_ARGS_SIZE}");
+        return Err(AxError::InvalidInput);
+    }
+
+    if args_size > core::mem::size_of::<Clone3Args>() {
+        debug!("sys_clone3: args_size {args_size} larger than expected, using known fields only");
+    }
+
+    let args_ptr = args_ptr as *const Clone3Args;
+    let clone3_args = unsafe { args_ptr.vm_read_uninit()?.assume_init() };
+
+    debug!("sys_clone3: args = {clone3_args:?}");
+
+    let args = clone3_args.into_clone_args()?;
+    do_clone(uctx, args)
+}
diff --git a/api/src/syscall/task/mod.rs b/api/src/syscall/task/mod.rs
index a6e77afc..2143a0e8 100644
--- a/api/src/syscall/task/mod.rs
+++ b/api/src/syscall/task/mod.rs
@@ -1,4 +1,5 @@
mod clone; +mod clone3; mod ctl; mod execve; mod exit; @@ -7,4 +8,6 @@ mod schedule; mod thread; mod wait; -pub use self::{clone::*, ctl::*, execve::*, exit::*, job::*, schedule::*, thread::*, wait::*}; +pub use self::{ + clone::*, clone3::*, ctl::*, execve::*, exit::*, job::*, schedule::*, thread::*, wait::*, +};