//
// Syd: rock-solid application kernel
// src/workers/emu.rs: `syd_emu' emulator threads
//
// Copyright (c) 2024, 2025 Ali Polatel <alip@chesswob.org>
// Based in part upon rusty_pool which is:
//     Copyright (c) Robin Friedli <robinfriedli@icloud.com>
//     SPDX-License-Identifier: Apache-2.0
//
// SPDX-License-Identifier: GPL-3.0

use std::{
    mem::MaybeUninit,
    option::Option,
    os::fd::RawFd,
    sync::{
        atomic::{AtomicBool, Ordering},
        Arc, RwLock,
    },
    thread,
};

#[allow(deprecated)]
use libc::SOCK_PACKET;
use libc::{AF_ALG, AF_INET, AF_INET6, AF_NETLINK, AF_UNIX, EACCES, SOCK_RAW};
use libseccomp::{scmp_cmp, RawSyscall, ScmpAction, ScmpArch, ScmpFilterContext, ScmpSyscall};
use libseccomp_sys::{const_scmp_filter_ctx, seccomp_load, seccomp_notify_receive};
use nix::{
    errno::Errno,
    fcntl::OFlag,
    sched::{unshare, CloneFlags},
    unistd::{close, Gid, Uid},
};

#[cfg(target_arch = "x86")]
use crate::cookie::FTRUNCATE64_COOKIE_ARG3;
#[cfg(target_arch = "x86")]
use crate::cookie::TRUNCATE64_COOKIE_ARG3;
use crate::{
    alert,
    config::*,
    confine::{
        confine_scmp_clone, confine_scmp_clone3, extend_ioctl, scmp_add_setid_rules, scmp_arch_raw,
        ScmpNotifReq, SydArch, Sydcall, EIDRM, EOWNERDEAD, X32_SYSCALL_BIT,
    },
    cookie::{
        ACCEPT4_COOKIE_ARG4, ACCEPT4_COOKIE_ARG5, BIND_COOKIE_ARG3, BIND_COOKIE_ARG4,
        BIND_COOKIE_ARG5, CONNECT_COOKIE_ARG3, CONNECT_COOKIE_ARG4, CONNECT_COOKIE_ARG5,
        FTRUNCATE64_COOKIE_ARG4, FTRUNCATE64_COOKIE_ARG5, FTRUNCATE_COOKIE_ARG2,
        FTRUNCATE_COOKIE_ARG3, FTRUNCATE_COOKIE_ARG4, FTRUNCATE_COOKIE_ARG5, LINKAT_COOKIE_ARG5,
        MEMFD_CREATE_COOKIE_ARG2, MEMFD_CREATE_COOKIE_ARG3, MEMFD_CREATE_COOKIE_ARG4,
        MEMFD_CREATE_COOKIE_ARG5, OPENAT2_COOKIE_ARG4, OPENAT2_COOKIE_ARG5, RENAMEAT2_COOKIE_ARG5,
        SECCOMP_IOCTL_NOTIF_ADDFD_COOKIE_ARG3, SECCOMP_IOCTL_NOTIF_ADDFD_COOKIE_ARG4,
        SECCOMP_IOCTL_NOTIF_ADDFD_COOKIE_ARG5, SECCOMP_IOCTL_NOTIF_SEND_COOKIE_ARG3,
        SECCOMP_IOCTL_NOTIF_SEND_COOKIE_ARG4, SECCOMP_IOCTL_NOTIF_SEND_COOKIE_ARG5,
        SOCKET_COOKIE_ARG3, SOCKET_COOKIE_ARG4, SOCKET_COOKIE_ARG5, SYS_ACCEPT4, SYS_BIND,
        SYS_CONNECT, SYS_SOCKET, TRUNCATE64_COOKIE_ARG4, TRUNCATE64_COOKIE_ARG5,
        TRUNCATE_COOKIE_ARG2, TRUNCATE_COOKIE_ARG3, TRUNCATE_COOKIE_ARG4, TRUNCATE_COOKIE_ARG5,
        UNLINKAT_COOKIE_ARG3, UNLINKAT_COOKIE_ARG4, UNLINKAT_COOKIE_ARG5,
    },
    err::{err2no, SydJoinHandle, SydResult},
    fs::{
        seccomp_notify_respond, AT_EXECVE_CHECK, SECCOMP_IOCTL_NOTIF_ADDFD,
        SECCOMP_IOCTL_NOTIF_LIST, SECCOMP_IOCTL_NOTIF_SEND,
    },
    hook::{HandlerMap, UNotifyEventRequest},
    info,
    path::dotdot_with_nul,
    proc::proc_mmap,
    sandbox::{Flags, Sandbox, SandboxGuard},
    timer::AlarmTimer,
    workers::{aes::AesMap, UnixMap, WorkerCache, WorkerData},
};

/// State owned by one `syd_emu` emulator thread.
///
/// A `Worker` is built with [`Worker::new`] and consumed by
/// `Worker::try_spawn`, which moves it into the spawned thread.
#[derive(Clone)]
pub(crate) struct Worker {
    // Seccomp notify file descriptor: notifications are received from it,
    // responses are sent to it, and it is closed when ghost mode is entered.
    fd: RawFd,
    // Copy of the sandbox flags taken under the read lock at construction.
    flags: Flags,
    // Shared worker cache, handed to every request that is handled.
    cache: Arc<WorkerCache<'static>>,
    // Shared sandbox, handed to every request that is handled.
    sandbox: Arc<RwLock<Sandbox>>,
    // Lookup table from (syscall, architecture) to its handler.
    handlers: Arc<HandlerMap>,
    // When set, an alarm timer with this many seconds is armed around
    // each blocking receive; `None` means no timer is created.
    keep_alive: Option<u16>,
    // Checked after every loop iteration; set once ghost mode is entered.
    should_exit: Arc<AtomicBool>,
    // Shared counters for the total and busy number of workers.
    worker_data: Arc<WorkerData>,
    // Present when Crypt sandboxing is on; in that case the thread does
    // not unshare its file descriptor table.
    crypt_map: Option<AesMap>,
    // Shared map handed to every request that is handled.
    // NOTE(review): exact contents are defined outside this file section.
    unix_map: UnixMap,
}

impl Worker {
    #[allow(clippy::too_many_arguments)]
    pub(crate) fn new(
        fd: RawFd,
        cache: Arc<WorkerCache<'static>>,
        sandbox: Arc<RwLock<Sandbox>>,
        handlers: Arc<HandlerMap>,
        keep_alive: Option<u16>,
        should_exit: Arc<AtomicBool>,
        worker_data: Arc<WorkerData>,
        crypt_map: Option<AesMap>,
        unix_map: UnixMap,
    ) -> Self {
        let my_sandbox = SandboxGuard::Read(sandbox.read().unwrap_or_else(|err| err.into_inner()));
        let flags = *my_sandbox.flags;
        drop(my_sandbox); // release the read lock.

        Worker {
            fd,
            flags,
            cache,
            sandbox,
            handlers,
            keep_alive,
            should_exit,
            worker_data,
            crypt_map,
            unix_map,
        }
    }

    #[allow(clippy::cognitive_complexity)]
    pub(crate) fn try_spawn(
        self,
        ctx: Option<&ScmpFilterContext>,
    ) -> Result<SydJoinHandle<()>, Errno> {
        // SAFETY: ScmpFilterContext is not Send,
        // so we cannot pass it between threads.
        // Therefore we pass a pointer which is
        // owned by the monitor thread. This
        // pointer is guaranteed to be valid
        // throughout Syd's lifetime.
        let mut ctx = ctx.map(|ctx| ctx.as_ptr() as usize);

        thread::Builder::new()
            .name("syd_emu".to_string())
            .stack_size(EMU_STACK_SIZE)
            .spawn(move || {
                // Unshare:
                // 1. CLONE_FS so cwd and umask are per-thread.
                // 2. CLONE_FILES so file descriptor table is per-thread.
                //
                // Note, we cannot do 2 if Crypt sanboxing is on because
                // emulator threads have to share memory fds with AES
                // threads.
                let mut unshare_flags = CloneFlags::CLONE_FS;
                let is_crypt = self.crypt_map.is_some();
                if !is_crypt {
                    unshare_flags.insert(CloneFlags::CLONE_FILES);
                }

                // SAFETY: We use exit_group(2) here to bail,
                // because this unsharing is a critical safety feature.
                if let Err(errno) = unshare(unshare_flags) {
                    alert!("ctx": "boot", "op": "unshare_emulator_thread",
                        "msg": format!("failed to unshare({unshare_flags:?}): {errno}"),
                        "err": errno as i32);
                    std::process::exit(101);
                }

                // SAFETY: We use exit_group(2) here to bail,
                // because expiring idle threads is a critical safety feature.
                let mut timer = if let Some(keep_alive) = self.keep_alive {
                    match AlarmTimer::from_seconds(keep_alive.into()) {
                        Ok(timer) => Some(timer),
                        Err(errno) => {
                            alert!("ctx": "boot", "op": "timer_create_for_emulator_thread",
                                "msg": format!("failed to set up timer: {errno}"),
                                "err": errno as i32);
                            std::process::exit(101);
                        }
                    }
                } else {
                    None
                };

                // Create sentinel, that will handle graceful teardown.
                let mut sentinel = Sentinel::new(&self);

                // Thread successfully started, increment total worker count.
                self.worker_data.increment_worker_total();

                loop {
                    // Confine and drop filter if sandbox is locked.
                    if let Some(filter) = ctx {
                        if Sandbox::locked_once() {
                            // SAFETY: filter pointer is owned by the
                            // monitor thread and is valid for Syd's
                            // lifetime.
                            let error = unsafe { seccomp_load(filter as const_scmp_filter_ctx) };

                            // SAFETY: We use exit_group(2) here to bail,
                            // because this confinement is a critical safety feature.
                            if error != 0 {
                                let errno = Errno::from_raw(error.abs());
                                alert!("ctx": "boot", "op": "confine_emulator_thread",
                                    "msg": format!("failed to confine: {error}"),
                                    "err": errno as i32);
                                std::process::exit(101);
                            }

                            let safe_setid = self.flags.intersects(
                                Flags::FL_ALLOW_SAFE_SETUID | Flags::FL_ALLOW_SAFE_SETGID,
                            );
                            info!("ctx": "confine", "op": "confine_emulator_thread",
                                "msg": format!("emulator thread confined with{} SROP mitigation",
                                    if safe_setid { "out" } else { "" }));

                            ctx = None;
                        }
                    }

                    // Receive seccomp notification.
                    let request = if let Ok(request) = self.receive(&mut timer) {
                        request
                    } else {
                        // Critical error, decrement worker total and exit.
                        self.worker_data.decrement_worker_total();
                        break;
                    };

                    if let Some(request) = request {
                        // Mark thread busy.
                        sentinel.seccomp_id = Some(request.id);
                        self.worker_data.increment_worker_busy();

                        // Handle request.
                        self.handle(request);

                        // Mark thread idle again.
                        sentinel.seccomp_id = None;
                        self.worker_data.decrement_worker_busy();
                    } // else process died-midway, continue.

                    // Exit if there's nothing else to handle.
                    if self.should_exit.load(Ordering::Relaxed) {
                        // Time to exit.
                        break;
                    }
                }

                Ok(())
            })
            .map_err(|err| err2no(&err))
    }

    /// Receive one notification and classify any failure.
    ///
    /// * `Ok(Some(request))`: a request was read.
    /// * `Err(EBADF)`: the file descriptor was closed; the caller stops.
    /// * `Err(EINTR)`: only when a timer is in use; the caller stops.
    /// * `Ok(None)`: any other error; these cannot be handled here
    ///   (e.g. EINTR|ENOENT when the task is killed mid-way), so the
    ///   caller simply continues.
    fn receive(&self, timer: &mut Option<AlarmTimer>) -> Result<Option<ScmpNotifReq>, Errno> {
        let errno = match self.read(timer) {
            Ok(request) => return Ok(Some(request)),
            Err(errno) => errno,
        };

        let fatal = errno == Errno::EBADF || (errno == Errno::EINTR && timer.is_some());
        if fatal {
            Err(errno)
        } else {
            Ok(None)
        }
    }

    /// Block on the seccomp fd until a notification arrives.
    ///
    /// When a timer is supplied it is started before the blocking call and
    /// stopped right after it, before the result is inspected.
    ///
    /// Returns the received request, the error of the receive call or of
    /// the conversion, or `EINTR` when the request carries a zero id or pid.
    ///
    /// # Panics
    ///
    /// Panics if the timer cannot be started or stopped.
    fn read(&self, timer: &mut Option<AlarmTimer>) -> Result<ScmpNotifReq, Errno> {
        // Use libc::seccomp_notif rather than libseccomp_sys's.
        // The latter is opaque and requires us to do a heap
        // allocation which we don't always want.
        let mut raw = MaybeUninit::<libc::seccomp_notif>::zeroed();

        if let Some(alarm) = timer.as_mut() {
            // SAFETY: Something is awfully wrong if we cannot
            // set the timer so we panic here to indicate this.
            #[allow(clippy::disallowed_methods)]
            alarm.start().expect("timer_settime");
        }

        // SAFETY: libseccomp's wrapper allocates each call.
        // Note: EINTR may also mean child killed by signal!
        let outcome =
            Errno::result(unsafe { seccomp_notify_receive(self.fd, raw.as_mut_ptr().cast()) });

        if let Some(alarm) = timer.as_mut() {
            // SAFETY: Same reasoning as for starting the timer.
            #[allow(clippy::disallowed_methods)]
            alarm.stop().expect("timer_settime");
        }

        // Propagate a receive failure only after the timer is stopped.
        if let Err(errno) = outcome {
            return Err(errno);
        }

        // SAFETY: seccomp_notify_receive returned success.
        // Request is populated and accessing it is safe.
        let notif = ScmpNotifReq::from_sys(unsafe { raw.assume_init() })?;

        // A zero id or pid means interrupted/task killed mid-way.
        if notif.id == 0 || notif.pid == 0 {
            return Err(Errno::EINTR);
        }

        Ok(notif)
    }

    /// Dispatch one seccomp notification to its handler and send the reply.
    ///
    /// The handler is looked up by (syscall, architecture). Two pseudo
    /// errnos in the handler's response are intercepted instead of being
    /// sent as-is:
    ///
    /// * `EIDRM` with zero id/val/flags: the request has already been
    ///   answered, so no reply is sent.
    /// * `EOWNERDEAD` with zero id/val/flags: ghost mode is entered. A
    ///   success reply is sent for the request, the seccomp fd is closed
    ///   and `should_exit` is raised.
    ///
    /// Errors from sending the reply are ignored on purpose.
    ///
    /// # Panics
    ///
    /// Panics if no handler is registered for the request's system call.
    #[allow(clippy::cognitive_complexity)]
    fn handle(&self, mut req: ScmpNotifReq) {
        // Correct architecture for x32 syscalls.
        if req.data.arch == ScmpArch::X8664
            && req.data.syscall.as_raw_syscall() & X32_SYSCALL_BIT != 0
        {
            req.data.arch = ScmpArch::X32;
        }

        // Lookup the system call handler, panic if not found.
        let syscall = Sydcall(req.data.syscall, scmp_arch_raw(req.data.arch));
        let handler = if let Some(handler) = self.handlers.get(&syscall) {
            handler
        } else {
            unreachable!("BUG: Missing hook for request {req:?}!");
        };

        let request = UNotifyEventRequest::new(
            req,
            syscall,
            self.fd,
            Arc::clone(&self.cache),
            Arc::clone(&self.sandbox),
            self.crypt_map.as_ref().map(Arc::clone),
            Arc::clone(&self.unix_map),
        );
        let mut response = handler(request);

        // Check for the following pseudo errnos:
        // 1. EIDRM:
        //    a. A previous SECCOMP_IOCTL_NOTIF_ADDFD has replied to the request with
        //       SECCOMP_ADDFD_FLAG_SEND already, no need to return a reply again here.
        //    b. A read-write encryption request was made and the encrypted fd has
        //       already been returned as a reply with SECCOMP_IOCTL_NOTIF_ADDFD with
        //       the flag SECCOMP_ADDFD_FLAG_SEND.
        // 2. EOWNERDEAD: Enter ghost mode.
        let ghost = match response.error {
            EIDRM if response.id == 0 && response.val == 0 && response.flags == 0 => return,
            EOWNERDEAD if response.id == 0 && response.val == 0 && response.flags == 0 => {
                crate::warn!("ctx": "confine", "op": "enter_ghost_mode", "pid": req.pid,
                    "sys": syscall, "arch": SydArch(req.data.arch), "args": req.data.args,
                    "src": proc_mmap(req.pid(), req.data.instr_pointer).ok());

                // Turn the pseudo response into a real success reply.
                // The pseudo response carries a zero id, so the id of the
                // request must be restored here; otherwise the reply below
                // cannot be matched to the pending notification.
                response.id = req.id;
                response.error = 0;
                response.val = 0;

                true
            }
            _ => false,
        };

        let response = libc::seccomp_notif_resp {
            id: response.id,
            val: response.val,
            error: response.error,
            flags: response.flags,
        };

        // EINTR is not retried because it may mean child is signaled.
        // ENOENT means child died mid-way.
        // Nothing else we can do on errors here.
        let _ = seccomp_notify_respond(self.fd, std::ptr::addr_of!(response));

        // Finalize ghost mode as necessary.
        if ghost {
            // Best effort: the fd is no longer needed after the final reply.
            let _ = close(self.fd);

            // Inform the monitor thread and other emulator threads to exit.
            self.should_exit.store(true, Ordering::Relaxed);
        }
    }

    /// Confine Worker thread.
    #[allow(clippy::cognitive_complexity)]
    pub(crate) fn prepare_confine(
        seccomp_fd: RawFd,
        flags: Flags,
        is_crypt: bool,
        transit_uids: &[(Uid, Uid)],
        transit_gids: &[(Gid, Gid)],
    ) -> SydResult<ScmpFilterContext> {
        // Create seccomp filter with default action.
        let mut ctx = ScmpFilterContext::new(ScmpAction::KillProcess)?;

        // Enforce the NO_NEW_PRIVS functionality before
        // loading the seccomp filter into the kernel.
        ctx.set_ctl_nnp(true)?;

        // Disable Speculative Store Bypass mitigations
        // with trace/allow_unsafe_spec_exec:1
        ctx.set_ctl_ssb(flags.allow_unsafe_spec_exec())?;

        // DO NOT synchronize filter to all threads.
        // Main thread will confine itself.
        ctx.set_ctl_tsync(false)?;

        // We kill for bad system call and bad arch.
        ctx.set_act_badarch(ScmpAction::KillProcess)?;

        // Use a binary tree sorted by syscall number if possible.
        let _ = ctx.set_ctl_optimize(2);

        // SAFETY: Do NOT add supported architectures to the filter.
        // This ensures Syd can never run a non-native system call,
        // which we do not need at all.
        // seccomp_add_architectures(&mut ctx)?;

        // Allow clones without namespace flags.
        confine_scmp_clone(&mut ctx)?;
        // Deny clone3 with ENOSYS for compatibility.
        confine_scmp_clone3(&mut ctx)?;

        // Deny open and {l,}stat with ENOSYS rather than KillProcess.
        // We need this because std::thread::spawn has unwanted
        // side-effects such as opening /sys/devices/system/cpu/online
        // on some architectures.
        for sysname in ["open", "stat", "lstat"] {
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule(ScmpAction::Errno(Errno::ENOSYS as i32), syscall)?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_emu_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }
        }

        // Allow a restricted set of ioctl(2) operations to our seccomp fd only.
        //
        // Syscall argument cookies for SECCOMP_IOCTL_NOTIF_SEND may be disabled
        // at startup with trace/allow_unsafe_nocookie:1.
        let sysname = "ioctl";
        #[allow(clippy::cast_sign_loss)]
        match ScmpSyscall::from_name(sysname) {
            Ok(syscall) => {
                #[allow(clippy::useless_conversion)]
                for ioctl_request in SECCOMP_IOCTL_NOTIF_LIST {
                    let mut rules = vec![scmp_cmp!($arg0 == seccomp_fd as u64)];

                    // SAFETY: We protect SECCOMP_IOCTL_NOTIF_ADDFD with system call argument
                    // cookies, to raise the bar against an attacker who has compromised Syd
                    // and aims to steal file descriptors.
                    //
                    // Randomizing the seccomp-fd at startup is another mitigation against this.
                    if !flags.allow_unsafe_nocookie() && *ioctl_request == SECCOMP_IOCTL_NOTIF_ADDFD
                    {
                        rules.extend(&[
                            scmp_cmp!($arg3 == (*SECCOMP_IOCTL_NOTIF_ADDFD_COOKIE_ARG3).into()),
                            scmp_cmp!($arg4 == (*SECCOMP_IOCTL_NOTIF_ADDFD_COOKIE_ARG4).into()),
                            scmp_cmp!($arg5 == (*SECCOMP_IOCTL_NOTIF_ADDFD_COOKIE_ARG5).into()),
                        ]);
                    }

                    // SAFETY: We protect SECCOMP_IOCTL_NOTIF_SEND with system call argument
                    // cookies, to raise the bar against an attacker who has compromised Syd
                    // and aims to inject the flag SECCOMP_USER_NOTIF_FLAG_CONTINUE to this
                    // response in order to pass-through a system call to the host Linux kernel.
                    //
                    // Randomizing the seccomp-fd at startup is another mitigation against this.
                    if !flags.allow_unsafe_nocookie() && *ioctl_request == SECCOMP_IOCTL_NOTIF_SEND
                    {
                        rules.extend(&[
                            scmp_cmp!($arg3 == (*SECCOMP_IOCTL_NOTIF_SEND_COOKIE_ARG3).into()),
                            scmp_cmp!($arg4 == (*SECCOMP_IOCTL_NOTIF_SEND_COOKIE_ARG4).into()),
                            scmp_cmp!($arg5 == (*SECCOMP_IOCTL_NOTIF_SEND_COOKIE_ARG5).into()),
                        ]);
                    }

                    rules.push(scmp_cmp!($arg1 == (*ioctl_request).into()));
                    ctx.add_rule_conditional(ScmpAction::Allow, syscall, &rules)?;

                    if let Some(ioctl_request) = extend_ioctl((*ioctl_request).into()) {
                        rules.pop();
                        rules.push(scmp_cmp!($arg1 == ioctl_request));
                        ctx.add_rule_conditional(ScmpAction::Allow, syscall, &rules)?;
                    }
                }
            }
            Err(_) => {
                info!("ctx": "confine", "op": "allow_emu_syscall",
                    "msg": format!("invalid or unsupported syscall {sysname}"));
            }
        }

        // Allow unshare(2) with CLONE_FS|CLONE_FILES only.
        let sysname = "unshare";
        match ScmpSyscall::from_name(sysname) {
            Ok(syscall) => {
                let flags = if is_crypt {
                    libc::CLONE_FS as u64
                } else {
                    (libc::CLONE_FILES | libc::CLONE_FS) as u64
                };
                ctx.add_rule_conditional(ScmpAction::Allow, syscall, &[scmp_cmp!($arg0 == flags)])?;
            }
            Err(_) => {
                info!("ctx": "confine", "op": "allow_emu_syscall",
                    "msg": format!("invalid or unsupported syscall {sysname}"));
            }
        }

        // Deny pipe2(2) O_NOTIFICATION_PIPE flag.
        let sysname = "pipe2";
        #[allow(clippy::cast_sign_loss)]
        if let Ok(syscall) = ScmpSyscall::from_name(sysname) {
            // O_NOTIFICATION_PIPE is equivalent to O_EXCL,
            // see: linux/watch_queue.h
            const O_NOTIFICATION_PIPE: u64 = OFlag::O_EXCL.bits() as u64;

            ctx.add_rule_conditional(
                ScmpAction::Allow,
                syscall,
                &[scmp_cmp!($arg1 & O_NOTIFICATION_PIPE == 0)],
            )?;
        } else {
            info!("ctx": "confine", "op": "allow_emu_syscall",
                "msg": format!("invalid or unsupported syscall {sysname}"));
        }

        // Allow safe fcntl(2) utility calls.
        for sysname in ["fcntl", "fcntl64"] {
            let syscall = match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => syscall,
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_emu_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                    continue;
                }
            };

            for op in EMU_FCNTL_OPS {
                ctx.add_rule_conditional(ScmpAction::Allow, syscall, &[scmp_cmp!($arg1 == *op)])?;
            }
        }

        // Deny installing new signal handlers for {rt_,}sigaction(2).
        for sysname in ["sigaction", "rt_sigaction"] {
            let syscall = match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => syscall,
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_emu_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                    continue;
                }
            };

            // Installs a signal handler if the `act` pointer (arg1) is non-NULL.
            // We deny this case, but allow returning the current handler.
            ctx.add_rule_conditional(ScmpAction::Allow, syscall, &[scmp_cmp!($arg1 == 0)])?;
        }

        // Allow safe system calls.
        for sysname in EMU_SYSCALLS {
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule(ScmpAction::Allow, syscall)?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_emu_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }
        }

        // Syscall argument cookies may be disabled
        // at startup with trace/allow_unsafe_nocookie:1.
        if !flags.allow_unsafe_nocookie() {
            // memfd_create(2) may be used only with syscall argument cookies.
            let sysname = "memfd_create";
            #[allow(clippy::useless_conversion)]
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule_conditional(
                        ScmpAction::Allow,
                        syscall,
                        &[
                            scmp_cmp!($arg2 == (*MEMFD_CREATE_COOKIE_ARG2).into()),
                            scmp_cmp!($arg3 == (*MEMFD_CREATE_COOKIE_ARG3).into()),
                            scmp_cmp!($arg4 == (*MEMFD_CREATE_COOKIE_ARG4).into()),
                            scmp_cmp!($arg5 == (*MEMFD_CREATE_COOKIE_ARG5).into()),
                        ],
                    )?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_emu_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }

            // renameat2(2) may be used only with syscall argument cookies.
            // We also prevent AT_FDCWD usage as fd argument.
            let sysname = "renameat2";
            #[allow(clippy::useless_conversion)]
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule_conditional(
                        ScmpAction::Allow,
                        syscall,
                        &[
                            scmp_cmp!($arg0 <= RawFd::MAX as u64),
                            scmp_cmp!($arg5 == (*RENAMEAT2_COOKIE_ARG5).into()),
                        ],
                    )?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_emu_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }

            // {,f}truncate{,64}(2) may be used only with syscall argument cookies.
            let sysname = "truncate";
            #[allow(clippy::useless_conversion)]
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule_conditional(
                        ScmpAction::Allow,
                        syscall,
                        &[
                            scmp_cmp!($arg2 == (*TRUNCATE_COOKIE_ARG2).into()),
                            scmp_cmp!($arg3 == (*TRUNCATE_COOKIE_ARG3).into()),
                            scmp_cmp!($arg4 == (*TRUNCATE_COOKIE_ARG4).into()),
                            scmp_cmp!($arg5 == (*TRUNCATE_COOKIE_ARG5).into()),
                        ],
                    )?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_emu_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }

            let sysname = "truncate64";
            #[allow(clippy::useless_conversion)]
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule_conditional(
                        ScmpAction::Allow,
                        syscall,
                        &[
                            #[cfg(target_arch = "x86")]
                            scmp_cmp!($arg3 == (*TRUNCATE64_COOKIE_ARG3).into()),
                            scmp_cmp!($arg4 == (*TRUNCATE64_COOKIE_ARG4).into()),
                            scmp_cmp!($arg5 == (*TRUNCATE64_COOKIE_ARG5).into()),
                        ],
                    )?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_emu_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }

            let sysname = "ftruncate";
            #[allow(clippy::useless_conversion)]
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule_conditional(
                        ScmpAction::Allow,
                        syscall,
                        &[
                            scmp_cmp!($arg2 == (*FTRUNCATE_COOKIE_ARG2).into()),
                            scmp_cmp!($arg3 == (*FTRUNCATE_COOKIE_ARG3).into()),
                            scmp_cmp!($arg4 == (*FTRUNCATE_COOKIE_ARG4).into()),
                            scmp_cmp!($arg5 == (*FTRUNCATE_COOKIE_ARG5).into()),
                        ],
                    )?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_emu_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }

            let sysname = "ftruncate64";
            #[allow(clippy::useless_conversion)]
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule_conditional(
                        ScmpAction::Allow,
                        syscall,
                        &[
                            #[cfg(target_arch = "x86")]
                            scmp_cmp!($arg3 == (*FTRUNCATE64_COOKIE_ARG3).into()),
                            scmp_cmp!($arg4 == (*FTRUNCATE64_COOKIE_ARG4).into()),
                            scmp_cmp!($arg5 == (*FTRUNCATE64_COOKIE_ARG5).into()),
                        ],
                    )?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_emu_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }

            // linkat(2) may be used only with syscall argument cookies.
            // We also enforce PROC_FD usage as fd argument.
            // This fd is randomized so it further raises the bar.
            let sysname = "linkat";
            #[allow(clippy::cast_sign_loss)]
            #[allow(clippy::useless_conversion)]
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule_conditional(
                        ScmpAction::Allow,
                        syscall,
                        &[
                            scmp_cmp!($arg0 == PROC_FD() as u64),
                            scmp_cmp!($arg5 == (*LINKAT_COOKIE_ARG5).into()),
                        ],
                    )?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_emu_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }

            // unlinkat(2) may be used only with syscall argument cookies.
            // We also prevent AT_FDCWD usage as fd argument.
            let sysname = "unlinkat";
            #[allow(clippy::useless_conversion)]
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule_conditional(
                        ScmpAction::Allow,
                        syscall,
                        &[
                            scmp_cmp!($arg0 <= RawFd::MAX as u64),
                            scmp_cmp!($arg3 == (*UNLINKAT_COOKIE_ARG3).into()),
                            scmp_cmp!($arg4 == (*UNLINKAT_COOKIE_ARG4).into()),
                            scmp_cmp!($arg5 == (*UNLINKAT_COOKIE_ARG5).into()),
                        ],
                    )?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_emu_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }

            // openat2(2) may be used only with syscall argument cookies.
            // We also prevent AT_FDCWD usage as fd argument.
            let sysname = "openat2";
            #[allow(clippy::useless_conversion)]
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule_conditional(
                        ScmpAction::Allow,
                        syscall,
                        &[
                            scmp_cmp!($arg0 <= RawFd::MAX as u64),
                            scmp_cmp!($arg4 == (*OPENAT2_COOKIE_ARG4).into()),
                            scmp_cmp!($arg5 == (*OPENAT2_COOKIE_ARG5).into()),
                        ],
                    )?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_emu_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }
        } else {
            // trace/allow_unsafe_nocookie: Allow access without cookies.
            for sysname in [
                "memfd_create",
                "renameat2",
                "truncate",
                "truncate64",
                "ftruncate",
                "ftruncate64",
                "linkat",
                "unlinkat",
                "openat2",
            ] {
                match ScmpSyscall::from_name(sysname) {
                    Ok(syscall) => {
                        ctx.add_rule(ScmpAction::Allow, syscall)?;
                    }
                    Err(_) => {
                        info!("ctx": "confine", "op": "allow_emu_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                    }
                }
            }
        }

        // openat(2) may be used to open the parent directory only by getdir_long().
        // The rest of the attempts are denied with ENOSYS for compat.
        let sysname = "openat";
        #[allow(clippy::cast_sign_loss)]
        match ScmpSyscall::from_name(sysname) {
            Ok(syscall) => {
                let dotdot = dotdot_with_nul();
                let oflags = (libc::O_RDONLY
                    | libc::O_CLOEXEC
                    | libc::O_DIRECTORY
                    | libc::O_LARGEFILE
                    | libc::O_NOCTTY
                    | libc::O_NOFOLLOW) as u64;
                ctx.add_rule_conditional(
                    ScmpAction::Allow,
                    syscall,
                    &[
                        scmp_cmp!($arg0 <= RawFd::MAX as u64),
                        scmp_cmp!($arg1 == dotdot),
                        scmp_cmp!($arg2 & oflags == oflags),
                    ],
                )?;
                ctx.add_rule_conditional(
                    ScmpAction::Errno(Errno::ENOSYS as i32),
                    syscall,
                    &[scmp_cmp!($arg0 > RawFd::MAX as u64)],
                )?;
                ctx.add_rule_conditional(
                    ScmpAction::Errno(Errno::ENOSYS as i32),
                    syscall,
                    &[scmp_cmp!($arg1 != dotdot)],
                )?;
            }
            Err(_) => {
                info!("ctx": "confine", "op": "allow_emu_syscall",
                    "msg": format!("invalid or unsupported syscall {sysname}"));
            }
        }

        // socket(2) may be used only with syscall argument cookies.
        //
        // We only enforce this on architectures where the system call is direct,
        // and there's no socketcall(2) multiplexer indirection.
        #[allow(clippy::cast_possible_truncation)]
        #[allow(clippy::cast_sign_loss)]
        #[allow(clippy::useless_conversion)]
        #[allow(deprecated)]
        if let Some(syscall) = SYS_SOCKET.map(|n| ScmpSyscall::from_raw_syscall(n as RawSyscall)) {
            // Deny based on socket type.
            if !flags.allow_unsafe_socket() {
                for ty in [SOCK_RAW as u64, SOCK_PACKET as u64] {
                    ctx.add_rule_conditional(
                        ScmpAction::Errno(EACCES),
                        syscall,
                        &[scmp_cmp!($arg1 == ty)],
                    )?;
                }
            }

            // Allow only specified socket domains.
            let domains = if !flags.allow_unsupp_socket() {
                let mut domains = vec![AF_UNIX, AF_INET, AF_INET6, AF_NETLINK];
                if flags.allow_safe_kcapi() {
                    domains.push(AF_ALG);
                }
                Some(domains)
            } else {
                None
            };

            // Secure using syscall argument cookies.
            let mut rules = vec![];
            if !flags.allow_unsafe_nocookie() {
                rules.extend(&[
                    scmp_cmp!($arg3 == (*SOCKET_COOKIE_ARG3).into()),
                    scmp_cmp!($arg4 == (*SOCKET_COOKIE_ARG4).into()),
                    scmp_cmp!($arg5 == (*SOCKET_COOKIE_ARG5).into()),
                ]);
            }

            if let Some(domains) = domains {
                for domain in domains {
                    rules.push(scmp_cmp!($arg0 == domain as u64));
                    ctx.add_rule_conditional(ScmpAction::Allow, syscall, &rules)?;
                    rules.pop();
                }
            } else if rules.is_empty() {
                ctx.add_rule(ScmpAction::Allow, syscall)?;
            } else {
                ctx.add_rule_conditional(ScmpAction::Allow, syscall, &rules)?;
            }
        } else {
            match ScmpSyscall::from_name("socket") {
                Ok(syscall) => {
                    // Allow socketcall(2).
                    ctx.add_rule(ScmpAction::Allow, syscall)?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_emu_syscall",
                        "msg": "invalid or unsupported syscall socket");
                }
            }
        }

        // accept4(2) may be used only with syscall argument cookies.
        //
        // We only enforce this on architectures where the system call is direct,
        // and there's no socketcall(2) multiplexer indirection.
        #[allow(clippy::cast_possible_truncation)]
        #[allow(clippy::cast_sign_loss)]
        #[allow(clippy::useless_conversion)]
        #[allow(deprecated)]
        if let Some(syscall) = SYS_ACCEPT4.map(|n| ScmpSyscall::from_raw_syscall(n as RawSyscall)) {
            // Secure using syscall argument cookies.
            let mut rules = vec![];
            if !flags.allow_unsafe_nocookie() {
                rules.extend(&[
                    scmp_cmp!($arg4 == (*ACCEPT4_COOKIE_ARG4).into()),
                    scmp_cmp!($arg5 == (*ACCEPT4_COOKIE_ARG5).into()),
                ]);
            }

            if rules.is_empty() {
                ctx.add_rule(ScmpAction::Allow, syscall)?;
            } else {
                ctx.add_rule_conditional(ScmpAction::Allow, syscall, &rules)?;
            }
        } else {
            match ScmpSyscall::from_name("accept4") {
                Ok(syscall) => {
                    // Allow socketcall(2).
                    ctx.add_rule(ScmpAction::Allow, syscall)?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_emu_syscall",
                        "msg": "invalid or unsupported syscall accept4");
                }
            }
        }

        // bind(2) may be used only with syscall argument cookies.
        //
        // We only enforce this on architectures where the system call is direct,
        // and there's no socketcall(2) multiplexer indirection.
        #[allow(clippy::cast_possible_truncation)]
        #[allow(clippy::cast_sign_loss)]
        #[allow(clippy::useless_conversion)]
        #[allow(deprecated)]
        if let Some(syscall) = SYS_BIND.map(|n| ScmpSyscall::from_raw_syscall(n as RawSyscall)) {
            // Secure using syscall argument cookies.
            let mut rules = vec![];
            if !flags.allow_unsafe_nocookie() {
                rules.extend(&[
                    scmp_cmp!($arg3 == (*BIND_COOKIE_ARG3).into()),
                    scmp_cmp!($arg4 == (*BIND_COOKIE_ARG4).into()),
                    scmp_cmp!($arg5 == (*BIND_COOKIE_ARG5).into()),
                ]);
            }

            if rules.is_empty() {
                ctx.add_rule(ScmpAction::Allow, syscall)?;
            } else {
                ctx.add_rule_conditional(ScmpAction::Allow, syscall, &rules)?;
            }
        } else {
            match ScmpSyscall::from_name("bind") {
                Ok(syscall) => {
                    // Allow socketcall(2).
                    ctx.add_rule(ScmpAction::Allow, syscall)?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_emu_syscall",
                        "msg": "invalid or unsupported syscall bind");
                }
            }
        }

        // connect(2) may be used only with syscall argument cookies.
        //
        // We only enforce this on architectures where the system call is direct,
        // and there's no socketcall(2) multiplexer indirection.
        #[allow(clippy::cast_possible_truncation)]
        #[allow(clippy::cast_sign_loss)]
        #[allow(clippy::useless_conversion)]
        #[allow(deprecated)]
        if let Some(syscall) = SYS_CONNECT.map(|n| ScmpSyscall::from_raw_syscall(n as RawSyscall)) {
            // Secure using syscall argument cookies.
            let mut rules = vec![];
            if !flags.allow_unsafe_nocookie() {
                rules.extend(&[
                    scmp_cmp!($arg3 == (*CONNECT_COOKIE_ARG3).into()),
                    scmp_cmp!($arg4 == (*CONNECT_COOKIE_ARG4).into()),
                    scmp_cmp!($arg5 == (*CONNECT_COOKIE_ARG5).into()),
                ]);
            }

            if rules.is_empty() {
                ctx.add_rule(ScmpAction::Allow, syscall)?;
            } else {
                ctx.add_rule_conditional(ScmpAction::Allow, syscall, &rules)?;
            }
        } else {
            match ScmpSyscall::from_name("connect") {
                Ok(syscall) => {
                    // Allow socketcall(2).
                    ctx.add_rule(ScmpAction::Allow, syscall)?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_emu_syscall",
                        "msg": "invalid or unsupported syscall connect");
                }
            }
        }

        // Allow futex system calls.
        for sysname in FUTEX_SYSCALLS {
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule(ScmpAction::Allow, syscall)?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_emu_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }
        }

        // Allow getid system calls.
        for sysname in GET_ID_SYSCALLS {
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule(ScmpAction::Allow, syscall)?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_emu_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }
        }

        // Allow execveat(2) with AT_EXECVE_CHECK for Linux>=6.14.
        if *HAVE_AT_EXECVE_CHECK {
            let sysname = "execveat";
            #[allow(clippy::cast_sign_loss)]
            let atcheck = AT_EXECVE_CHECK.bits() as u64;
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule_conditional(
                        ScmpAction::Allow,
                        syscall,
                        &[scmp_cmp!($arg4 & atcheck == atcheck)],
                    )?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_emu_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }
        }

        // Allow UID/GID changing system calls as necessary.
        let safe_setuid = flags.allow_safe_setuid();
        let safe_setgid = flags.allow_safe_setgid();
        if safe_setuid || safe_setgid {
            scmp_add_setid_rules(
                "emu",
                &mut ctx,
                safe_setuid,
                safe_setgid,
                transit_uids,
                transit_gids,
            )?;

            // SAFETY:
            // 1. cap{g,s}et is necessary to drop CAP_SET{U,G}ID after changing {U,G}ID.
            // 2. Signal system calls are necessary to handle reserved signals.
            // Note, {rt_,}sigreturn is already allowed for emulators to handle SIGALRM.
            for sysname in ["capget", "capset", "sigaction", "rt_sigaction"] {
                match ScmpSyscall::from_name(sysname) {
                    Ok(syscall) => {
                        ctx.add_rule(ScmpAction::Allow, syscall)?;
                    }
                    Err(_) => {
                        info!("ctx": "confine", "op": "allow_emu_syscall",
                            "msg": format!("invalid or unsupported syscall {sysname}"));
                    }
                }
            }
        }

        Ok(ctx)
    }
}

/// Guard that cleans up after a worker thread that panicked.
///
/// A `Sentinel` borrows its `Worker` and implements `Drop`. When it is
/// dropped while the current thread is panicking, the `Drop` impl does
/// one of two things:
///
/// - If a seccomp request id is recorded in `seccomp_id` (busy thread),
///   it denies that request with `EACCES` and calls `decrement_both()`
///   on the worker data.
/// - Otherwise (idle thread), it calls `decrement_worker_total()` on
///   the worker data.
///
/// Dropping a `Sentinel` while the thread is not panicking does nothing.
struct Sentinel<'a> {
    /// Id of the seccomp request to deny if the thread panics; `None`
    /// right after construction.
    // NOTE(review): presumably set by the work loop while a request is
    // being handled and cleared afterwards -- confirm against the caller.
    seccomp_id: Option<u64>,
    /// Worker this guard belongs to; its `fd` and `worker_data` are
    /// used during panic cleanup.
    worker_ref: &'a Worker,
}

impl<'a> Sentinel<'a> {
    fn new(worker_ref: &'a Worker) -> Sentinel<'a> {
        Self {
            seccomp_id: None,
            worker_ref,
        }
    }

    #[allow(clippy::arithmetic_side_effects)]
    fn deny_syscall(&self, seccomp_id: u64, errno: Errno) {
        let response = libc::seccomp_notif_resp {
            id: seccomp_id,
            val: 0,
            error: -(errno as i32),
            flags: 0,
        };

        // EAGAIN|EINTR is retried.
        // ENOENT means child died mid-way.
        // Nothing else we can do on errors here.
        let _ = seccomp_notify_respond(self.worker_ref.fd, std::ptr::addr_of!(response));
    }
}

impl Drop for Sentinel<'_> {
    /// Runs panic cleanup for the worker; a drop outside of a panic is a no-op.
    fn drop(&mut self) {
        // Nothing to clean up unless this thread is unwinding from a panic.
        if !thread::panicking() {
            return;
        }

        let worker_data = &self.worker_ref.worker_data;
        match self.seccomp_id {
            Some(request_id) => {
                // Busy thread panicked.
                // SAFETY: The system call in progress must be denied
                // before the counters are updated.
                self.deny_syscall(request_id, Errno::EACCES);
                worker_data.decrement_both();
            }
            None => {
                // Idle thread panicked.
                worker_data.decrement_worker_total();
            }
        }
    }
}
