//
// Syd: rock-solid application kernel
// src/workers/aes.rs: `syd_aes' encryption thread
//
// Copyright (c) 2024, 2025 Ali Polatel <alip@chesswob.org>
//
// SPDX-License-Identifier: GPL-3.0

// SAFETY: This module has (almost) been liberated from unsafe code!
// OwnedFd::from_raw_fd is used for crypt_fd and enc_fd, which is unsafe.
// Use deny rather than forbid so we can allow this case.
#![deny(unsafe_code)]

use std::{
    collections::hash_map::Entry,
    env, fmt,
    os::fd::{AsFd, FromRawFd, OwnedFd, RawFd},
    sync::{
        atomic::{AtomicBool, Ordering},
        Arc, Condvar, Mutex,
    },
    thread,
};

use libseccomp::{scmp_cmp, RawSyscall, ScmpAction, ScmpFilterContext, ScmpSyscall};
use nix::{
    errno::Errno,
    fcntl::{posix_fadvise, splice, tee, OFlag, PosixFadviseAdvice, SpliceFFlags},
    unistd::{lseek64, write, Gid, Uid, Whence},
};
use serde::{Serialize, Serializer};

#[cfg(target_arch = "x86")]
use crate::cookie::FTRUNCATE64_COOKIE_ARG3;
use crate::{
    config::*,
    confine::{
        confine_scmp_clone, confine_scmp_clone3, confine_scmp_fadvise, confine_scmp_madvise,
        confine_scmp_write, scmp_add_setid_rules, ExportMode,
    },
    cookie::{
        safe_ftruncate64, safe_pipe2, ACCEPT4_COOKIE_ARG4, ACCEPT4_COOKIE_ARG5,
        FTRUNCATE64_COOKIE_ARG4, FTRUNCATE64_COOKIE_ARG5, FTRUNCATE_COOKIE_ARG2,
        FTRUNCATE_COOKIE_ARG3, FTRUNCATE_COOKIE_ARG4, FTRUNCATE_COOKIE_ARG5, PIPE2_COOKIE_ARG2,
        PIPE2_COOKIE_ARG3, PIPE2_COOKIE_ARG4, PIPE2_COOKIE_ARG5, SYS_ACCEPT4,
    },
    debug,
    err::{err2no, SydJoinHandle, SydResult},
    error,
    fs::{seal_memfd, FileInfo},
    hash::{
        aes_ctr_enc, aes_ctr_init, hmac_sha256_feed, hmac_sha256_fini, hmac_sha256_init,
        SydHashMap, BLOCK_SIZE, HMAC_TAG_SIZE, IV, IV_SIZE, SYD3_HDR_SIZE,
    },
    info,
    ofd::lock_fd,
    path::{XPath, XPathBuf},
    retry::{retry_on_eintr, retry_on_intr},
    sandbox::Flags,
};

/// Access mode of a file handled by the `syd_aes` worker.
///
/// The mode is derived from open(2) flags through the `From<OFlag>`
/// implementation and is rendered as `read`, `append` or `write` by the
/// `Display` and `Serialize` implementations.
///
/// The derived ordering follows declaration order:
/// `Read < Append < Write`.
#[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd)]
pub(crate) enum AesMod {
    /// Neither `O_WRONLY` nor `O_RDWR` was set; syncing is a no-op.
    Read,
    /// Opened for writing with `O_APPEND`.
    Append,
    /// Opened for writing without `O_APPEND`.
    Write,
}

impl fmt::Display for AesMod {
    /// Writes the lowercase name of the mode: `read`, `append` or `write`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let name = match self {
            Self::Read => "read",
            Self::Append => "append",
            Self::Write => "write",
        };
        f.write_str(name)
    }
}

impl Serialize for AesMod {
    /// Serializes the mode as a string, using its `Display` rendering
    /// (`read`, `append` or `write`).
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        // Hand the Display implementation to the serializer instead of
        // building a temporary String first; serializers that support it
        // can then format the value without an intermediate allocation.
        serializer.collect_str(self)
    }
}

impl From<OFlag> for AesMod {
    /// Maps open(2) flags to an access mode.
    ///
    /// Flags containing `O_WRONLY` or `O_RDWR` yield `Append` when
    /// `O_APPEND` is also set and `Write` otherwise; anything else
    /// yields `Read`.
    fn from(flags: OFlag) -> Self {
        let writable = flags.contains(OFlag::O_WRONLY) || flags.contains(OFlag::O_RDWR);
        let appending = flags.contains(OFlag::O_APPEND);

        match (writable, appending) {
            (false, _) => Self::Read,
            (true, true) => Self::Append,
            (true, false) => Self::Write,
        }
    }
}

/// A single encryption request tracked in the [`AesMap`].
///
/// The file descriptors are stored as raw numbers; the worker thread
/// that handles the request wraps them into `OwnedFd`s and thereby
/// takes over closing them.
#[derive(Clone)]
pub(crate) struct AesVal {
    /// Plaintext fd (a memfd or a regular file) that is read during sync.
    pub(crate) crypt_fd: RawFd,
    /// Fd of the encrypted file that the ciphertext is written to.
    pub(crate) enc_fd: RawFd,
    /// IV of the request. None means encryption in progress:
    /// the `syd_aes` thread takes the IV when it picks the request up.
    pub(crate) iv: Option<IV>,
    /// File information of the entry.
    /// NOTE(review): not read anywhere in this part of the file --
    /// presumably used by the code that inserts/looks up entries; confirm.
    pub(crate) info: FileInfo,
    /// Access mode the file was opened with.
    pub(crate) mode: AesMod,
}

/// Encryption requests keyed by path (the `crypt_path` that the worker
/// logs as the target of the sync).
pub(crate) type AesMap = SydHashMap<XPathBuf, AesVal>;
/// Shared handle to the request map: the map behind a mutex, paired
/// with the condition variable the `syd_aes` thread waits on while the
/// map is empty.
pub(crate) type AesLock = Arc<(Mutex<AesMap>, Condvar)>;

/// State of the `syd_aes` encryption thread.
#[derive(Clone)]
pub(crate) struct AesWorker {
    /// Algorithm fds in the order (AES, HMAC); passed to
    /// `aes_ctr_init` and `hmac_sha256_init` respectively during sync.
    fdalg: (RawFd, RawFd),
    /// Shared map of encryption requests plus its condition variable.
    files: AesLock,
    /// Sandbox flags used when building the seccomp filter.
    flags: Flags,
    /// Whether plaintext fds are memfds (these are sealed before sync).
    is_memfd: bool,
    /// Set to request termination; the thread exits once the request
    /// map is empty.
    should_exit: Arc<AtomicBool>,
    /// UID transitions forwarded to `scmp_add_setid_rules`.
    transit_uids: Vec<(Uid, Uid)>,
    /// GID transitions forwarded to `scmp_add_setid_rules`.
    transit_gids: Vec<(Gid, Gid)>,
}

// Common options for functions:
// - AesWorker::hmac_prefeed
// - AesWorker::write_crypt_header
// - AesWorker::aes_feed
// - AesWorker::hmac_write
//
// Bundled into one Copy struct so that the same value can be handed to
// several of these helpers in a row.
#[derive(Clone, Copy, Debug)]
struct AesWorkerOpts<'a> {
    // Path of the file being synced; used in log messages.
    crypt_path: &'a XPath,
    // Whether the plaintext fd is a memfd; used in log messages.
    memfd: bool,
    // Access mode of the request; used in log messages.
    file_mode: AesMod,
    // Offset of the last full block (header excluded).
    // Only meaningful for hmac_prefeed in append mode; callers pass 0
    // where it is unused.
    last_block_offset: u64,
}

impl AesWorker {
    /// Creates a new AES worker.
    ///
    /// The UID/GID transition slices are copied into vectors owned by
    /// the worker; every other argument is stored as given.
    pub(crate) fn new(
        fdalg: (RawFd, RawFd),
        files: AesLock,
        flags: Flags,
        is_memfd: bool,
        should_exit: Arc<AtomicBool>,
        transit_uids: &[(Uid, Uid)],
        transit_gids: &[(Gid, Gid)],
    ) -> Self {
        // Take owned copies of the borrowed transition tables.
        let transit_uids = Vec::from(transit_uids);
        let transit_gids = Vec::from(transit_gids);

        Self {
            fdalg,
            files,
            flags,
            is_memfd,
            should_exit,
            transit_uids,
            transit_gids,
        }
    }

    /// Confine AES thread.
    #[expect(clippy::cognitive_complexity)]
    pub(crate) fn prepare_confine(
        flags: Flags,
        transit_uids: &[(Uid, Uid)],
        transit_gids: &[(Gid, Gid)],
    ) -> SydResult<ScmpFilterContext> {
        // Create seccomp filter with default action.
        let mut ctx = ScmpFilterContext::new(ScmpAction::KillProcess)?;

        // Enforce the NO_NEW_PRIVS functionality before
        // loading the seccomp filter into the kernel.
        ctx.set_ctl_nnp(true)?;

        // Disable Speculative Store Bypass mitigations
        // with trace/allow_unsafe_exec_speculative:1
        ctx.set_ctl_ssb(flags.allow_unsafe_exec_speculative())?;

        // DO NOT synchronize filter to all threads.
        // Main thread will confine itself.
        ctx.set_ctl_tsync(false)?;

        // We kill for bad system call and bad arch.
        ctx.set_act_badarch(ScmpAction::KillProcess)?;

        // Use a binary tree sorted by syscall number if possible.
        let _ = ctx.set_ctl_optimize(2);

        // SAFETY: Do NOT add supported architectures to the filter.
        // This ensures Syd can never run a non-native system call,
        // which we do not need at all.
        // seccomp_add_architectures(&mut ctx)?;

        // Deny open and stat family with ENOSYS rather than KillProcess.
        // We need this because std::thread::spawn has unwanted
        // side-effects such as opening /sys/devices/system/cpu/online
        // on some architectures.
        //
        // Note, we avoid this when profiling is enabled,
        // as gperf requires it to write profiling data.
        for sysname in [
            "open",
            "openat",
            "openat2",
            "stat",
            "lstat",
            "statx",
            "newfstatat",
        ] {
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    let action = if !cfg!(feature = "prof") {
                        ScmpAction::Errno(Errno::ENOSYS as i32)
                    } else {
                        ScmpAction::Allow
                    };
                    ctx.add_rule(action, syscall)?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_aes_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }
        }

        // Allow reads up to MAX(HMAC | IV) bytes.
        // See the note in sync_file().
        //
        // IMPORTANT: Because of this rule, log invocations
        // after sandboxing must NOT include the `pid` key
        // which will cause an immediate Syd crash with an
        // audit log entry.
        let rw_max: u64 = HMAC_TAG_SIZE
            .max(IV_SIZE)
            .try_into()
            .or(Err(Errno::EOVERFLOW))?;
        match ScmpSyscall::from_name("read") {
            Ok(syscall) => {
                ctx.add_rule_conditional(
                    ScmpAction::Allow,
                    syscall,
                    &[scmp_cmp!($arg2 <= rw_max)],
                )?;
            }
            Err(_) => {
                info!("ctx": "confine", "op": "allow_aes_syscall",
                    "msg": "invalid or unsupported syscall read");
            }
        }

        // Allow writes to log-fd OR up to MAX(HMAC | IV) bytes.
        // No proc_pid_mem(5) access required here.
        confine_scmp_write(&mut ctx, Some(rw_max), false)?;

        // Allow clones without namespace flags.
        confine_scmp_clone(&mut ctx)?;
        // Deny clone3 with ENOSYS for compatibility.
        confine_scmp_clone3(&mut ctx)?;

        // Allow fadvise family syscalls.
        confine_scmp_fadvise(&mut ctx)?;

        // Allow safe madvise(2) advice.
        confine_scmp_madvise(&mut ctx)?;

        // ftruncate{,64}(2) may be used only with syscall argument cookies.
        let sysname = "ftruncate";
        #[expect(clippy::useless_conversion)]
        match ScmpSyscall::from_name(sysname) {
            Ok(syscall) => {
                // Secure using syscall argument cookies.
                let mut rules = vec![];
                if !flags.allow_unsafe_nocookie() {
                    rules.extend(&[
                        scmp_cmp!($arg2 == (*FTRUNCATE_COOKIE_ARG2).into()),
                        scmp_cmp!($arg3 == (*FTRUNCATE_COOKIE_ARG3).into()),
                        scmp_cmp!($arg4 == (*FTRUNCATE_COOKIE_ARG4).into()),
                        scmp_cmp!($arg5 == (*FTRUNCATE_COOKIE_ARG5).into()),
                    ]);
                }

                if rules.is_empty() {
                    ctx.add_rule(ScmpAction::Allow, syscall)?;
                } else {
                    ctx.add_rule_conditional(ScmpAction::Allow, syscall, &rules)?;
                }
            }
            Err(_) => {
                info!("ctx": "confine", "op": "allow_emu_syscall",
                    "msg": format!("invalid or unsupported syscall {sysname}"));
            }
        }

        let sysname = "ftruncate64";
        #[expect(clippy::useless_conversion)]
        match ScmpSyscall::from_name(sysname) {
            Ok(syscall) => {
                // Secure using syscall argument cookies.
                let mut rules = vec![];
                if !flags.allow_unsafe_nocookie() {
                    rules.extend(&[
                        #[cfg(target_arch = "x86")]
                        scmp_cmp!($arg3 == (*FTRUNCATE64_COOKIE_ARG3).into()),
                        scmp_cmp!($arg4 == (*FTRUNCATE64_COOKIE_ARG4).into()),
                        scmp_cmp!($arg5 == (*FTRUNCATE64_COOKIE_ARG5).into()),
                    ]);
                }

                if rules.is_empty() {
                    ctx.add_rule(ScmpAction::Allow, syscall)?;
                } else {
                    ctx.add_rule_conditional(ScmpAction::Allow, syscall, &rules)?;
                }
            }
            Err(_) => {
                info!("ctx": "confine", "op": "allow_emu_syscall",
                    "msg": format!("invalid or unsupported syscall {sysname}"));
            }
        }

        // accept4(2) may be used only with syscall argument cookies.
        //
        // We only enforce this on architectures where the system call is direct,
        // and there's no socketcall(2) multiplexer indirection.
        #[expect(clippy::cast_possible_truncation)]
        #[expect(clippy::useless_conversion)]
        if let Some(syscall) = SYS_ACCEPT4.map(|n| ScmpSyscall::from_raw_syscall(n as RawSyscall)) {
            // Secure using syscall argument cookies.
            let mut rules = vec![];
            if !flags.allow_unsafe_nocookie() {
                rules.extend(&[
                    scmp_cmp!($arg4 == (*ACCEPT4_COOKIE_ARG4).into()),
                    scmp_cmp!($arg5 == (*ACCEPT4_COOKIE_ARG5).into()),
                ]);
            }

            if rules.is_empty() {
                ctx.add_rule(ScmpAction::Allow, syscall)?;
            } else {
                ctx.add_rule_conditional(ScmpAction::Allow, syscall, &rules)?;
            }
        } else {
            match ScmpSyscall::from_name("accept4") {
                Ok(syscall) => {
                    // Allow socketcall(2).
                    ctx.add_rule(ScmpAction::Allow, syscall)?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_emu_syscall",
                        "msg": "invalid or unsupported syscall accept4");
                }
            }
        }

        // Restrict pipe2(2) flags.
        // Restrict pipe2(2) using syscall argument cookies.
        let sysname = "pipe2";
        #[expect(clippy::cast_sign_loss)]
        #[expect(clippy::useless_conversion)]
        if let Ok(syscall) = ScmpSyscall::from_name(sysname) {
            let mut rules = vec![scmp_cmp!($arg1 == OFlag::O_CLOEXEC.bits() as u64)];
            if !flags.allow_unsafe_nocookie() {
                rules.extend(&[
                    scmp_cmp!($arg2 == (*PIPE2_COOKIE_ARG2).into()),
                    scmp_cmp!($arg3 == (*PIPE2_COOKIE_ARG3).into()),
                    scmp_cmp!($arg4 == (*PIPE2_COOKIE_ARG4).into()),
                    scmp_cmp!($arg5 == (*PIPE2_COOKIE_ARG5).into()),
                ]);
            }

            ctx.add_rule_conditional(ScmpAction::Allow, syscall, &rules)?;
        } else {
            info!("ctx": "confine", "op": "allow_aes_syscall",
                "msg": format!("invalid or unsupported syscall {sysname}"));
        }

        // Allow safe fcntl(2) utility calls.
        for sysname in ["fcntl", "fcntl64"] {
            let syscall = match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => syscall,
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_aes_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                    continue;
                }
            };

            for op in AES_FCNTL_OPS {
                ctx.add_rule_conditional(ScmpAction::Allow, syscall, &[scmp_cmp!($arg1 == *op)])?;
            }
        }

        // Allow safe prctl(2) operations.
        let sysname = "prctl";
        if let Ok(syscall) = ScmpSyscall::from_name(sysname) {
            for (_, op) in AES_PRCTL_OPS {
                ctx.add_rule_conditional(ScmpAction::Allow, syscall, &[scmp_cmp!($arg0 == *op)])?;
            }
        } else {
            info!("ctx": "confine", "op": "allow_aes_syscall",
                "msg": format!("invalid or unsupported syscall {sysname}"));
        }

        // Deny installing new signal handlers for {rt_,}sigaction(2).
        for sysname in ["sigaction", "rt_sigaction"] {
            let syscall = match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => syscall,
                Err(_) => {
                    crate::info!("ctx": "confine", "op": "allow_main_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                    continue;
                }
            };

            // Installs a signal handler if first argument is non-NULL.
            // We deny this case, but allow returning the current handler.
            ctx.add_rule_conditional(ScmpAction::Allow, syscall, &[scmp_cmp!($arg1 == 0)])?;
        }

        // Allow safe system calls.
        for sysname in AES_SYSCALLS.iter().chain(VDSO_SYSCALLS) {
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule(ScmpAction::Allow, syscall)?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_aes_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }
        }

        // Allow futex system calls.
        for sysname in FUTEX_SYSCALLS {
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule(ScmpAction::Allow, syscall)?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_aes_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }
        }

        // Allow getid system calls.
        for sysname in GET_ID_SYSCALLS {
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule(ScmpAction::Allow, syscall)?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_aes_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }
        }

        // Allow UID/GID changing system calls as necessary.
        let safe_setuid = flags.allow_safe_setuid();
        let safe_setgid = flags.allow_safe_setgid();
        if safe_setuid || safe_setgid {
            scmp_add_setid_rules(
                "aes",
                &mut ctx,
                safe_setuid,
                safe_setgid,
                transit_uids,
                transit_gids,
            )?;
        }

        Ok(ctx)
    }

    /// Spawn the `syd_aes` thread.
    ///
    /// Unless confinement is skipped (the skip environment variable is
    /// set or an export mode is requested), the new thread builds and
    /// loads its seccomp filter first, then enters the main loop.
    ///
    /// # Errors
    ///
    /// Returns the errno of the failed thread creation. Errors that
    /// happen inside the thread are the thread's own return value.
    #[expect(clippy::cognitive_complexity)]
    pub(crate) fn try_spawn(self) -> Result<SydJoinHandle<()>, Errno> {
        let builder = thread::Builder::new()
            .name(String::from("syd_aes"))
            .stack_size(AES_STACK_SIZE);

        let spawned = builder.spawn(move || {
            // Honour dry-run when exporting.
            let skip_confine =
                env::var_os(ENV_SKIP_SCMP).is_some() || ExportMode::from_env().is_some();

            if skip_confine {
                error!("ctx": "confine", "op": "confine_aes_thread",
                    "msg": "AES threads are running unconfined in debug mode");
            } else {
                // Load the filter immediately.
                // Logging is permitted as long as the `pid` key is unused.
                // See prepare_confine for more information.
                let filter =
                    Self::prepare_confine(self.flags, &self.transit_uids, &self.transit_gids)?;
                filter.load()?;

                let safe_setid = self
                    .flags
                    .intersects(Flags::FL_ALLOW_SAFE_SETUID | Flags::FL_ALLOW_SAFE_SETGID);
                let without = if safe_setid { "out" } else { "" };
                info!("ctx": "confine", "op": "confine_aes_thread",
                    "msg": format!("AES thread confined with{} SROP mitigation", without));
            }

            // Enter main loop.
            Self::main(self.fdalg, self.files, self.is_memfd, self.should_exit)
        });

        match spawned {
            Ok(handle) => Ok(handle),
            Err(err) => Err(err2no(&err)),
        }
    }

    /// Main loop of the `syd_aes` thread.
    ///
    /// Waits for encryption requests in the shared map, takes the IV of
    /// every request that still carries one (marking it as in progress)
    /// and spawns one worker thread per request. The loop ends once the
    /// map is empty and `should_exit` is set; all remaining worker
    /// threads are joined before returning.
    ///
    /// # Errors
    ///
    /// Returns an error as soon as spawning a worker thread fails.
    fn main(
        fdalg: (RawFd, RawFd),
        files: AesLock,
        is_memfd: bool,
        should_exit: Arc<AtomicBool>,
    ) -> SydResult<()> {
        let (map_lock, cvar) = &*files;
        let mut pending = Vec::new();
        let mut workers: Vec<Option<SydJoinHandle<()>>> = Vec::with_capacity(*NPROC);

        loop {
            // Sleep on the condition variable while there is nothing to
            // do; a poisoned mutex is recovered rather than propagated.
            //
            // NOTE(review): while in-progress entries (iv == None) remain
            // in the map, this loop does not block here and iterates
            // again immediately -- confirm this is intended.
            let mut map = map_lock.lock().unwrap_or_else(|e| e.into_inner());
            while map.is_empty() && !should_exit.load(Ordering::Relaxed) {
                map = cvar.wait(map).unwrap_or_else(|e| e.into_inner());
            }

            // Exit only when there's no pending work.
            if map.is_empty() && should_exit.load(Ordering::Relaxed) {
                break;
            }

            // Steal the IV of every request that has not been picked up
            // yet; the entry stays in the map with `iv == None`.
            pending.extend(map.iter_mut().filter_map(|(path, val)| {
                let iv = val.iv.take()?;
                let request = AesVal {
                    iv: Some(iv),
                    ..*val
                };
                Some((path.clone(), request))
            }));
            drop(map); // Release the lock.

            // Reap worker threads that have already finished.
            workers.retain_mut(|slot| {
                let finished = slot.as_ref().is_some_and(|t| t.is_finished());
                if finished {
                    if let Some(handle) = slot.take() {
                        let _ = handle.join();
                    }
                }
                !finished
            });

            // One worker thread per pending request.
            for (path, request) in pending.drain(..) {
                let handle = Self::spawn(fdalg, &files, &path, request, is_memfd)?;
                workers.push(Some(handle));
            }
        }

        // Wait for the ongoing encryption operations before exiting.
        for handle in workers.into_iter().flatten() {
            let _ = handle.join();
        }

        Ok(())
    }

    /// Spawn a worker thread that syncs one plaintext file into its
    /// encrypted counterpart.
    ///
    /// The thread takes ownership of `crypt_data.crypt_fd`, blocks until
    /// it holds the write lock on it, runs `Self::sync` and finally
    /// removes the map entry of `crypt_path`, unless a new IV has been
    /// stored for that path in the meantime.
    ///
    /// If taking the lock fails, the sync is skipped but the cleanup
    /// still runs: the encrypted fd is closed and the map entry is
    /// removed, so a failed request cannot linger as "in progress"
    /// forever and keep the main loop from ever seeing an empty map.
    ///
    /// # Errors
    ///
    /// Returns an error if the thread cannot be created. The outcome of
    /// the sync itself is the return value of the spawned thread.
    fn spawn(
        fdalg: (RawFd, RawFd),
        files: &AesLock,
        crypt_path: &XPath,
        crypt_data: AesVal,
        memfd: bool,
    ) -> SydResult<SydJoinHandle<()>> {
        let handle = retry_on_intr(|| {
            // The closure may run more than once, so every attempt gets
            // its own copies of the captured state.
            let files = Arc::clone(files);
            let crypt_data = crypt_data.clone();
            let crypt_path = crypt_path.to_owned();

            thread::Builder::new()
                .name("syd_aes".into())
                .stack_size(AES_STACK_SIZE)
                .spawn(move || {
                    // SAFETY: crypt_map keys are valid FDs.
                    #[expect(unsafe_code)]
                    let crypt_fd = unsafe { OwnedFd::from_raw_fd(crypt_data.crypt_fd) };

                    // Wait until we take a write lock on the encrypted fd.
                    // This will succeed once all fds owned by the sandbox
                    // process are closed.
                    let result: SydResult<()> =
                        match retry_on_eintr(|| lock_fd(&crypt_fd, true, true)) {
                            Ok(_) => {
                                debug!("ctx": "aes", "op": "start_sync",
                                    "path": &crypt_path, "memfd": memfd,
                                    "msg": format!("plaintext {} is locked for write, sync to `{crypt_path}' started",
                                        if memfd { "memfd" } else { "file" }));

                                // All good, sync contents to disk.
                                Self::sync(fdalg, &crypt_fd, &crypt_path, crypt_data, memfd)
                            }
                            Err(errno) => {
                                // sync() never ran, so nobody took ownership of
                                // the encrypted fd yet: close it here.
                                // SAFETY: crypt_data.enc_fd is a valid FD.
                                #[expect(unsafe_code)]
                                let enc_fd = unsafe { OwnedFd::from_raw_fd(crypt_data.enc_fd) };
                                drop(enc_fd);

                                Err(errno.into())
                            }
                        };

                    // Safe to remove file entry now.
                    // This must happen on failure too, otherwise the entry
                    // would stay marked as in progress forever.
                    {
                        let (aes_map, _cvar) = &*files;
                        let mut aes_map = aes_map.lock().unwrap_or_else(|e| e.into_inner());
                        if let Entry::Occupied(entry) = aes_map.entry(crypt_path) {
                            // A `Some` IV means a new request was queued for
                            // this path; leave it for the main loop.
                            if entry.get().iv.is_none() {
                                entry.remove();
                            }
                        }
                    } // Lock is released here.

                    // Close the encrypted FD.
                    drop(crypt_fd);

                    result
                })
                .map_err(|err| err2no(&err))
        })?;

        Ok(handle)
    }

    #[expect(clippy::cognitive_complexity)]
    fn sync<Fd: AsFd>(
        fdalg: (RawFd, RawFd),
        crypt_fd: Fd,
        crypt_path: &XPath,
        crypt_data: AesVal,
        memfd: bool,
    ) -> SydResult<()> {
        // Seal memfd to ensure no further writes happen.
        if memfd {
            seal_memfd(&crypt_fd)?;
            debug!("ctx": "aes", "op": "seal_memfd",
                "path": crypt_path, "memfd": memfd,
                "msg": "sealed plaintext memfd to prevent seals, writes, shrinks and grows");
        }

        let (aes_fd, mac_fd) = fdalg;
        let file_mode = crypt_data.mode;
        // syd_aes thread steals the IV,
        // therefore it is always Some.
        #[expect(clippy::disallowed_methods)]
        let mut iv = crypt_data.iv.unwrap();

        // SAFETY: crypt_data.enc_fd is a valid FD.
        #[expect(unsafe_code)]
        let enc_fd = unsafe { OwnedFd::from_raw_fd(crypt_data.enc_fd) };
        debug!("ctx": "aes", "op": "encrypt_start",
            "path": crypt_path, "memfd": memfd, "mode": file_mode,
            "msg": format!("start encrypting plaintext {} into `{crypt_path}' with mode {file_mode}",
                if memfd { "memfd" } else { "file" }));

        // Nothing to do if file was readonly.
        let mut is_append = match file_mode {
            AesMod::Read => {
                debug!("ctx": "aes", "op": "encrypt_noop",
                    "path": crypt_path, "memfd": memfd, "mode": file_mode,
                    "msg": "sync done: plaintext was read but never written");
                return Ok(());
            }
            AesMod::Append => true,
            _ => false,
        };

        // Handle truncation quickly.
        let data_size: u64 = lseek64(&crypt_fd, 0, Whence::SeekEnd)?
            .try_into()
            .or(Err(Errno::EOVERFLOW))?;
        if data_size == 0 {
            retry_on_eintr(|| safe_ftruncate64(&enc_fd, 0))?;
            debug!("ctx": "aes", "op": "encrypt_zero",
                "path": crypt_path, "memfd": memfd, "mode": file_mode,
                "msg": "sync done: plaintext was truncated to zero");
            return Ok(());
        }

        // Handle opened for append but encrypted file is new.
        let mut file_size: u64 = lseek64(&enc_fd, 0, Whence::SeekEnd)?
            .try_into()
            .or(Err(Errno::EOVERFLOW))?;
        if is_append && file_size == 0 {
            is_append = false;
        }

        // Handle opened for append but appended nothing quickly.
        #[expect(clippy::arithmetic_side_effects)]
        if is_append
            && data_size
                <= file_size.saturating_sub((CRYPT_MAGIC.len() + HMAC_TAG_SIZE + IV_SIZE) as u64)
        {
            debug!("ctx": "aes", "op": "append_noop",
                "path": crypt_path, "memfd": memfd, "mode": file_mode,
                "data_size": data_size, "file_size": file_size,
                "msg": "sync done: plaintext was never appended");
            return Ok(());
        }

        // We handled quick cases, before possibly
        // truncating the encrypted file, let's
        // ensure we open the connections as expected,
        // and use posix_fadvise(2) to hint the kernel
        // about I/O access patterns.

        // Initialize HMAC socket and feed magic header and IV.
        let sock_mac = hmac_sha256_init(&mac_fd, false)?;
        hmac_sha256_feed(&sock_mac, CRYPT_MAGIC, true)?;
        hmac_sha256_feed(&sock_mac, iv.as_ref(), true)?;
        let (pipe_rd_mac, pipe_wr_mac) = safe_pipe2(OFlag::O_CLOEXEC)?;

        // Hint the kernel about I/O access patterns.
        Self::advise_io(&crypt_fd, &enc_fd)?;

        // Handle last block re-encryption for append.
        if is_append {
            // Adjust file_size to exclude the header.
            file_size = file_size
                .checked_sub(SYD3_HDR_SIZE)
                .ok_or(Errno::EOVERFLOW)?;

            // Get offset of the last full block.
            let last_block_offset = Self::get_last_block_offset(file_size)?;

            // Adjust the IV counter based on the last full block offset.
            iv.add_counter(last_block_offset);

            // Position crypt_fd offset for append.
            Self::seek2append(&crypt_fd, last_block_offset, file_size)?;

            // SAFETY: Ensure no stale bytes from the last partial block survive.
            // Let L = file_size (header excluded), B = BLOCK_SIZE, r = L % B, k = L - r.
            // We write exactly (data_size - k) bytes, with data_size >= L by definition
            // of append (or equal on degenerate no-op paths which we early-exit).
            // Then (data_size - k) >= (L - k) = r, so the overwrite spans the entire
            // old tail and extends by (data_size - L). No truncate required.
            // Enforce this invariant with an assert so regressions are caught.
            assert!(
                data_size >= file_size,
                "BUG: append invariant violated: data-size:{data_size:#x} < file-size:{file_size:#x}; report a bug!",
            );

            let options = AesWorkerOpts {
                memfd,
                file_mode,
                crypt_path,
                last_block_offset,
            };

            // Feed kept ciphertext prefix into HMAC.
            Self::hmac_prefeed(&enc_fd, &sock_mac, &pipe_rd_mac, &pipe_wr_mac, options)?;
        } else {
            // Non-append mode: overwrite the file.

            // Reset crypt_fd to the beginning.
            lseek64(&crypt_fd, 0, Whence::SeekSet)?;

            if file_size > 0 {
                // Remove previous content,
                // SAFETY: wipe IV to avoid reuse.
                retry_on_eintr(|| safe_ftruncate64(&enc_fd, 0))?;
                lseek64(&enc_fd, 0, Whence::SeekSet)?;
            }

            let options = AesWorkerOpts {
                memfd,
                file_mode,
                crypt_path,
                last_block_offset: 0, // unused.
            };

            // Write encrypted file header with the given IV.
            Self::write_crypt_header(&enc_fd, &iv, options)?;
        }

        // Initialize encryption socket, and set IV.
        let sock_enc = aes_ctr_init(&aes_fd, false)?;
        aes_ctr_enc(&sock_enc, &[], Some(&iv), true)?;

        // The IV is no longer needed.
        drop(iv);

        let (pipe_rd_enc, pipe_wr_enc) = safe_pipe2(OFlag::O_CLOEXEC)?;

        let options = AesWorkerOpts {
            memfd,
            file_mode,
            crypt_path,
            last_block_offset: 0, // unused.
        };

        // Feed plaintext into AES & HMAC algorithm sockets.
        Self::aes_feed(
            (&crypt_fd, &enc_fd),
            (&sock_enc, &sock_mac),
            (&pipe_rd_enc, &pipe_wr_enc, &pipe_rd_mac, &pipe_wr_mac),
            options,
        )?;

        // Write HMAC tag to the encrypted file.
        Self::hmac_write(&enc_fd, &sock_mac, options)?;

        // All good, farewell to all OwnedFds!
        debug!("ctx": "aes", "op": "encrypt_done",
            "path": crypt_path, "memfd": memfd, "mode": file_mode,
            "msg": format!("sync done: unlock `{crypt_path}' on close"));
        Ok(())
    }

    /// Tell the kernel how the two files are going to be accessed.
    ///
    /// Issues, in this order:
    /// 1. SEQUENTIAL on `enc_fd`, to encourage clustered I/O and
    ///    reduce random writeback patterns before the writes start.
    /// 2. SEQUENTIAL on `crypt_fd`.
    /// 3. WILLNEED on `crypt_fd`, to prime readahead and avoid small
    ///    read bursts starving the AF_ALG pipeline.
    ///
    /// The whole range of each file is covered (offset 0, length 0).
    /// The first failing call aborts the sequence and its error is
    /// returned.
    fn advise_io<Fd1: AsFd, Fd2: AsFd>(crypt_fd: Fd1, enc_fd: Fd2) -> Result<(), Errno> {
        let plain = crypt_fd.as_fd();
        let cipher = enc_fd.as_fd();

        let hints = [
            (cipher, PosixFadviseAdvice::POSIX_FADV_SEQUENTIAL),
            (plain, PosixFadviseAdvice::POSIX_FADV_SEQUENTIAL),
            (plain, PosixFadviseAdvice::POSIX_FADV_WILLNEED),
        ];
        for (fd, advice) in hints {
            posix_fadvise(fd, 0, 0, advice)?;
        }

        Ok(())
    }

    /// Rounds `file_size` down to a multiple of `BLOCK_SIZE`, i.e.
    /// returns the offset at which the trailing partial block (if any)
    /// starts.
    ///
    /// # Arguments
    ///
    /// - file_size: file size without the header size.
    ///
    /// # Errors
    ///
    /// Returns `EOVERFLOW` if the remainder or the subtraction cannot
    /// be computed.
    fn get_last_block_offset(file_size: u64) -> Result<u64, Errno> {
        file_size
            .checked_rem(BLOCK_SIZE as u64)
            .and_then(|tail| file_size.checked_sub(tail))
            .ok_or(Errno::EOVERFLOW)
    }

    /// Moves the offset of the plaintext fd to where re-encryption has
    /// to start for an append.
    ///
    /// If the encrypted file ends in a partial block, that block must be
    /// re-encrypted, so reading starts at the last full block offset.
    /// Otherwise reading starts at the current file size. Both cases
    /// amount to seeking to the smaller of the two values.
    ///
    /// # Arguments
    ///
    /// - last_block_offset: offset of the last full block.
    /// - file_size: file size without the header size.
    ///
    /// # Errors
    ///
    /// Returns `EOVERFLOW` if the target offset does not fit into an
    /// `i64`, or the error of the seek itself.
    fn seek2append<Fd: AsFd>(
        crypt_fd: Fd,
        last_block_offset: u64,
        file_size: u64,
    ) -> Result<(), Errno> {
        let target = last_block_offset.min(file_size);
        let target = i64::try_from(target).map_err(|_| Errno::EOVERFLOW)?;

        lseek64(crypt_fd, target, Whence::SeekSet)?;
        Ok(())
    }

    /// Read from the encrypted file starting after the header.
    ///
    /// Feed only the kept ciphertext prefix into HMAC, i.e.
    /// the range [header .. header + last_block_offset).
    /// Do not read to EOF, stale bytes from last partial block
    /// will be overwritten afterwards.
    fn hmac_prefeed<Fd1: AsFd, Fd2: AsFd, Fd3: AsFd, Fd4: AsFd>(
        enc_fd: Fd1,
        sock_mac: Fd2,
        pipe_rd_mac: Fd3,
        pipe_wr_mac: Fd4,
        options: AesWorkerOpts,
    ) -> Result<(), Errno> {
        let memfd = options.memfd;
        let file_mode = options.file_mode;
        let crypt_path = options.crypt_path;
        let last_block_offset = options.last_block_offset;
        debug!("ctx": "aes", "op": "authenticate_text",
            "path": crypt_path, "memfd": memfd, "mode": file_mode,
            "msg": "feed ciphertext into HMAC algorithm socket");

        #[expect(clippy::cast_possible_wrap)]
        lseek64(&enc_fd, SYD3_HDR_SIZE as i64, Whence::SeekSet)?;

        let mut remain: usize = last_block_offset.try_into().or(Err(Errno::EOVERFLOW))?;
        while remain > 0 {
            let n = retry_on_eintr(|| {
                splice(
                    &enc_fd,
                    None,
                    &pipe_wr_mac,
                    None,
                    remain.min(PIPE_BUF_ALG),
                    SpliceFFlags::empty(),
                )
            })?;
            if n == 0 {
                // splice(2) returned 0 before 'remain' reached 0. This indicates
                // unexpected EOF on enc_fd (e.g., concurrent shrink/truncation) or a
                // broken pipe path. We cannot complete the bounded pre-feed safely.
                // Treat as an I/O error.
                return Err(Errno::EIO);
            }

            let mut ncopy = n;
            while ncopy > 0 {
                let n = retry_on_eintr(|| {
                    splice(
                        &pipe_rd_mac,
                        None,
                        &sock_mac,
                        None,
                        ncopy,
                        SpliceFFlags::SPLICE_F_MORE,
                    )
                })?;
                if n == 0 {
                    return Err(Errno::EBADMSG);
                }
                ncopy = ncopy.checked_sub(n).ok_or(Errno::EOVERFLOW)?;
            }

            // Subtract n bytes from remaining bytes to read.
            remain = remain.checked_sub(n).ok_or(Errno::EOVERFLOW)?;
        }

        Ok(())
    }

    /// Write encrypted file header with the given `IV`.
    ///
    /// The header is written at the current offset of `enc_fd` as three
    /// consecutive sections: the `CRYPT_MAGIC` bytes, a zero-filled
    /// placeholder of `HMAC_TAG_SIZE` bytes, and the IV.
    ///
    /// # Errors
    ///
    /// Returns `Errno::EINVAL` if write(2) reports zero bytes written, or
    /// the errno of a failed write(2). `EINTR` is retried.
    fn write_crypt_header<Fd: AsFd>(
        enc_fd: Fd,
        iv: &IV,
        options: AesWorkerOpts,
    ) -> Result<(), Errno> {
        let memfd = options.memfd;
        let file_mode = options.file_mode;
        let crypt_path = options.crypt_path;
        debug!("ctx": "aes", "op": "write_magic",
            "path": crypt_path, "memfd": memfd, "mode": file_mode,
            "msg": "truncated file to write magic and IV");

        // The space for the HMAC tag is reserved with explicit zeroes.
        // Seeking forward by HMAC_TAG_SIZE to leave a hole instead is not
        // portable. The real tag is written over the zeroes at the end.
        let hmac_placeholder = [0u8; HMAC_TAG_SIZE];
        let iv_bytes = iv.as_ref();

        // SAFETY: We need the write(2) system call to write file magic,
        // HMAC and IV to the file so our seccomp filter unfortunately
        // allows it. We do our best by only allowing writes up to the HMAC
        // size, which is 32 bytes. Arguably, pulling a BROP with only 32
        // bytes of buffer-space allowed to transfer the binary over a
        // socket would be really tedious.
        // Alternatively writing the HMAC & IV to xattrs would be a
        // dangerous (think backups stripping xattrs), and relatively less
        // portable workaround.
        //
        // The sections are written one after the other and never merged
        // into a single buffer, so no write(2) gets larger than before.
        let sections: [&[u8]; 3] = [
            &CRYPT_MAGIC[..],
            &hmac_placeholder[..],
            &iv_bytes[..],
        ];
        for section in sections {
            let mut rest = section;
            while !rest.is_empty() {
                match write(&enc_fd, rest) {
                    Ok(0) => return Err(Errno::EINVAL),
                    // Drop the bytes that made it out and retry the tail.
                    Ok(n) => rest = rest.get(n..).unwrap_or_default(),
                    Err(Errno::EINTR) => continue,
                    Err(errno) => return Err(errno),
                }
            }
        }

        Ok(())
    }

    /// Feed plaintext into AES & HMAC algorithm sockets.
    ///
    /// Plaintext is read from `crypt_fd` until EOF and pushed through the
    /// AES socket. The ciphertext coming back is written to `enc_fd` and
    /// also fed into the HMAC socket. All transfers use splice(2)/tee(2)
    /// with `None` offsets, so the current file offsets of `crypt_fd` and
    /// `enc_fd` decide where reading and writing take place.
    ///
    /// # Arguments
    ///
    /// - crypt_fds: `(crypt_fd, enc_fd)`, the plaintext source and the
    ///   ciphertext destination.
    /// - sock_fds: `(sock_enc, sock_mac)`, the AES and the HMAC socket.
    /// - pipe_fds: `(pipe_rd_enc, pipe_wr_enc, pipe_rd_mac, pipe_wr_mac)`,
    ///   the read/write ends of the encryption pipe and of the MAC pipe.
    /// - options: only read for the debug log entry.
    ///
    /// # Errors
    ///
    /// Propagates any errno of the splice/tee helpers and of `aes_ctr_enc`
    /// (except `EINVAL`, which is tolerated on finalization). Returns
    /// `EOVERFLOW` if the byte accounting over- or underflows.
    fn aes_feed<
        Fd1: AsFd,
        Fd2: AsFd,
        Fd3: AsFd,
        Fd4: AsFd,
        Fd5: AsFd,
        Fd6: AsFd,
        Fd7: AsFd,
        Fd8: AsFd,
    >(
        crypt_fds: (Fd1, Fd2),
        sock_fds: (Fd3, Fd4),
        pipe_fds: (Fd5, Fd6, Fd7, Fd8),
        options: AesWorkerOpts,
    ) -> Result<(), Errno> {
        let (crypt_fd, enc_fd) = crypt_fds;
        let (sock_enc, sock_mac) = sock_fds;
        let (pipe_rd_enc, pipe_wr_enc, pipe_rd_mac, pipe_wr_mac) = pipe_fds;
        let memfd = options.memfd;
        let file_mode = options.file_mode;
        let crypt_path = options.crypt_path;
        debug!("ctx": "aes", "op": "copy_text",
            "path": crypt_path, "memfd": memfd, "mode": file_mode,
            "msg": "feed plaintext into AES & HMAC algorithm sockets");

        // Feed plaintext via zero-copy into the kernel socket.
        //
        // `nflush` counts the bytes that were fed into the AES socket but
        // whose ciphertext has not been spliced back out of it yet.
        let mut nflush = 0usize;
        loop {
            // Pull up to PIPE_BUF_ALG bytes of plaintext into the
            // encryption pipe. A zero-byte result is taken as EOF.
            let nfeed = retry_on_eintr(|| {
                splice(
                    &crypt_fd,
                    None,
                    &pipe_wr_enc,
                    None,
                    PIPE_BUF_ALG,
                    SpliceFFlags::empty(),
                )
            })?;
            if nfeed == 0 {
                break;
            }

            // splice(2) plaintext into AES socket.
            // This drains exactly the nfeed bytes staged above, so the
            // encryption pipe is free to carry ciphertext below.
            Self::splice_all(&pipe_rd_enc, &sock_enc, nfeed, SpliceFFlags::SPLICE_F_MORE)?;

            nflush = nflush.checked_add(nfeed).ok_or(Errno::EOVERFLOW)?;
            while nflush >= BLOCK_SIZE {
                // Only request whole blocks: `len` is `nflush` rounded down
                // to a multiple of BLOCK_SIZE, the remainder stays pending.
                let rem = nflush.checked_rem(BLOCK_SIZE).ok_or(Errno::EOVERFLOW)?;
                let len = nflush.checked_sub(rem).ok_or(Errno::EOVERFLOW)?;

                // splice(2) len bytes of ciphertext from AES socket into enc pipe.
                // The call may move fewer than len bytes; everything below
                // works with the actual count `n`.
                let n = Self::splice_nonzero(
                    &sock_enc,
                    &pipe_wr_enc,
                    len,
                    SpliceFFlags::SPLICE_F_MORE,
                )?;

                // Duplicate data from encryption pipe to the MAC pipe using tee(2).
                // tee(2) does not consume its input, so the same n bytes
                // are still in the encryption pipe for the next splice.
                Self::tee_all(&pipe_rd_enc, &pipe_wr_mac, n)?;

                // Splice encrypted data to output file.
                Self::splice_all(&pipe_rd_enc, &enc_fd, n, SpliceFFlags::empty())?;
                nflush = nflush.checked_sub(n).ok_or(Errno::EOVERFLOW)?;

                // Splice duplicated data to HMAC socket.
                Self::splice_all(&pipe_rd_mac, &sock_mac, n, SpliceFFlags::SPLICE_F_MORE)?;
            }
        }

        // Flush the final batch.
        // The loop above only leaves once nflush < BLOCK_SIZE, so what is
        // pending here is the final partial block, if there is one.
        while nflush > 0 {
            // Finalize encryption with `false`.
            // NOTE(review): the flag presumably signals that no more data
            // follows; confirm against aes_ctr_enc.
            //
            // Some kernel versions may incorrectly return EINVAL here.
            // Gracefully handle this errno and move on.
            match aes_ctr_enc(&sock_enc, &[], None, false) {
                Ok(_) | Err(Errno::EINVAL) => {}
                Err(errno) => return Err(errno),
            }

            let len = nflush.min(PIPE_BUF_ALG);
            let n = Self::splice_nonzero(&sock_enc, &pipe_wr_enc, len, SpliceFFlags::empty())?;

            // Duplicate data from encryption pipe to the MAC pipe using tee(2).
            Self::tee_all(&pipe_rd_enc, &pipe_wr_mac, n)?;

            // Splice encrypted data to output file.
            Self::splice_all(&pipe_rd_enc, &enc_fd, n, SpliceFFlags::empty())?;
            nflush = nflush.checked_sub(n).ok_or(Errno::EOVERFLOW)?;

            // Splice duplicated data to HMAC socket.
            Self::splice_all(&pipe_rd_mac, &sock_mac, n, SpliceFFlags::SPLICE_F_MORE)?;
        }

        Ok(())
    }

    /// Finalize the HMAC and store the tag in the encrypted file header.
    ///
    /// The tag is written at offset `CRYPT_MAGIC.len()`, i.e. directly
    /// behind the file magic.
    ///
    /// # Errors
    ///
    /// Returns the errno of `hmac_sha256_fini`, `lseek64` or write(2), or
    /// `Errno::EINVAL` if write(2) reports zero bytes written. `EINTR` from
    /// write(2) is retried.
    fn hmac_write<Fd1: AsFd, Fd2: AsFd>(
        enc_fd: Fd1,
        sock_mac: Fd2,
        options: AesWorkerOpts,
    ) -> Result<(), Errno> {
        let memfd = options.memfd;
        let file_mode = options.file_mode;
        let crypt_path = options.crypt_path;

        // Finalize HMAC computation and retrieve the tag.
        // SAFETY: This is the only place where we use the read(2) system
        // call hence we allow read(2) system call up to 32 bytes which is
        // the size of the HMAC.
        let hmac_tag = hmac_sha256_fini(&sock_mac)?;

        // Rewind to the position right after the magic header.
        #[expect(clippy::cast_possible_wrap)]
        lseek64(&enc_fd, CRYPT_MAGIC.len() as i64, Whence::SeekSet)?;

        debug!("ctx": "aes", "op": "write_hmac",
            "path": crypt_path, "memfd": memfd, "mode": file_mode,
            "msg": "write HMAC tag into header");

        // Write the tag, resuming after short writes until nothing is left.
        let mut rest: &[u8] = hmac_tag.as_slice();
        while !rest.is_empty() {
            match write(&enc_fd, rest) {
                Ok(0) => return Err(Errno::EINVAL),
                Ok(n) => rest = rest.get(n..).unwrap_or_default(),
                Err(Errno::EINTR) => continue,
                Err(errno) => return Err(errno),
            }
        }

        Ok(())
    }

    /// Moves exactly `len` bytes from `src` to `dst` with splice(2).
    ///
    /// Short transfers are resumed with the remaining count. A transfer of
    /// zero bytes is reported as `EBADMSG` by `splice_nonzero`; an underflow
    /// of the remaining count is reported as `EOVERFLOW`.
    fn splice_all<Fd1: AsFd, Fd2: AsFd>(
        src: Fd1,
        dst: Fd2,
        len: usize,
        flags: SpliceFFlags,
    ) -> Result<(), Errno> {
        let mut pending = len;
        loop {
            if pending == 0 {
                return Ok(());
            }
            let moved = Self::splice_nonzero(&src, &dst, pending, flags)?;
            pending = pending.checked_sub(moved).ok_or(Errno::EOVERFLOW)?;
        }
    }

    /// Duplicates exactly `len` bytes from pipe `src` into pipe `dst` with
    /// tee(2).
    ///
    /// Every call requests the full remaining count; short transfers are
    /// resumed. A transfer of zero bytes is reported as `EBADMSG` by
    /// `tee_nonzero`; an underflow of the remaining count is reported as
    /// `EOVERFLOW`.
    fn tee_all<Fd1: AsFd, Fd2: AsFd>(src: Fd1, dst: Fd2, len: usize) -> Result<(), Errno> {
        let mut pending = len;
        loop {
            if pending == 0 {
                return Ok(());
            }
            let copied = Self::tee_nonzero(&src, &dst, pending)?;
            pending = pending.checked_sub(copied).ok_or(Errno::EOVERFLOW)?;
        }
    }

    /// Performs one splice(2) from `src` to `dst` that has to make progress.
    ///
    /// Both offsets are passed as `None` and the call is wrapped in
    /// `retry_on_eintr`. Returns the number of bytes moved, which is never
    /// zero: a zero-byte transfer is turned into `Errno::EBADMSG`.
    fn splice_nonzero<Fd1: AsFd, Fd2: AsFd>(
        src: Fd1,
        dst: Fd2,
        len: usize,
        flags: SpliceFFlags,
    ) -> Result<usize, Errno> {
        match retry_on_eintr(|| splice(&src, None, &dst, None, len, flags))? {
            0 => Err(Errno::EBADMSG),
            moved => Ok(moved),
        }
    }

    /// Performs one tee(2) from `src` to `dst` that has to make progress.
    ///
    /// The call uses empty flags and is wrapped in `retry_on_eintr`. Returns
    /// the number of bytes duplicated, which is never zero: a zero-byte
    /// transfer is turned into `Errno::EBADMSG`.
    fn tee_nonzero<Fd1: AsFd, Fd2: AsFd>(src: Fd1, dst: Fd2, len: usize) -> Result<usize, Errno> {
        match retry_on_eintr(|| tee(&src, &dst, len, SpliceFFlags::empty()))? {
            0 => Err(Errno::EBADMSG),
            copied => Ok(copied),
        }
    }
}
