#!/usr/bin/env python3
#
# Copyright (c) 2025 The NetBSD Foundation, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
# ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#

"""reject_normcasecollision is a hook to avoid case/normalization issues.

Usage:
  [hooks]
  pretxnchangegroup.reject_normcasecollision =
     python:....reject_normcasecollision.hook
"""

import os
import re
import time
import unicodedata

from typing import Iterable
from typing import Iterator
from typing import Optional

from mercurial.i18n import _
from mercurial.utils import stringutil
from mercurial import (
    error,
    localrepo,
    match,
    pycompat,
    ui,
)


def _pathprefixes(p: str) -> Iterator[str]:
    """Yield each directory prefix of a path, including the path itself.
    """
    assert not os.path.isabs(p)
    while p:
        yield p
        p = os.path.dirname(p)


def _pathcompcount(p: str) -> int:
    """Return the number of components in a reltaive path p.

    '' has no components, 'foo' has one component, 'foo/bar' has two,
    &c.
    """
    assert not os.path.isabs(p)
    n = 0
    while p:
        n += 1
        p = os.path.dirname(p)
    return n


def _pathncompprefix(p: str, n: int) -> str:
    """Return the n-component prefix of relative path p.

    >>> _pathncompprefix('foo/bar/baz/quux', 2)
    'foo/bar'
    """
    while _pathcompcount(p) > n:
        p = os.path.dirname(p)
    return p


def _dirprefixsetmatcher(dirprefixes: Iterable[str]) -> re.Pattern:
    """Return a compiled regexp matching any given directory prefix.

    >>> m = _dirprefixsetmatcher(['foo/bar', 'foo/bars'])
    >>> m.match('foo')
    None
    >>> m.match('foo/')
    None
    >>> m.match('foo/ba')
    None
    >>> m.match('foo/bar')
    <re.Match object; span=(0, 7), match='foo/bar'>
    >>> m.match('foo/bar/')
    <re.Match object; span=(0, 8), match='foo/bar/'>
    >>> m.match('foo/bar/baz')
    <re.Match object; span=(0, 8), match='foo/bar/'>
    >>> m.match('foo/bars')
    <re.Match object; span=(0, 8), match='foo/bars'>
    >>> m.match('foo/barx')
    None
    """
    # Match the start of the string through either a / or the end of
    # the string so that we only select full components.
    #
    s = '^(' + \
        '|'.join(re.escape(prefix) for prefix in dirprefixes) + \
        ')(/|$)'
    return re.compile(s)


def hook(
    ui: ui.ui,
    repo: localrepo.localrepository,
    hooktype: bytes,
    node: Optional[bytes] = None,
    **kwargs,
) -> None:
    """Reject incoming changesets that add problematic file paths.

    For every revision in the changelog from node onward, each path
    added by the revision is checked:

    - it must decode as UTF-8;
    - it must be in Unicode Normal Form C;
    - every code point must have a name in the Unicode 3.2.0 database;
    - neither it nor any of its directory prefixes may collide, after
      case-folding, with another path or prefix added by the same
      revision;
    - neither it nor any of its directory prefixes may collide, after
      case-folding, with a differently spelled path or prefix in the
      manifest of any parent that was not itself rejected.

    All problems are collected and reported together through ui.error.

    ui is used for status and error output, repo is the repository
    receiving the changesets, hooktype must be b'pretxnchangegroup',
    and node identifies the first incoming changeset.  Any further
    keyword arguments are ignored.

    Raises error.Abort if hooktype is not b'pretxnchangegroup' or if
    any revision was rejected.
    """
    if hooktype != b'pretxnchangegroup':
        raise error.Abort(
            _(b'Unsupported hook type %r') % pycompat.bytestr(hooktype)
        )

    # XXX test rename and copy too
    # XXX test merge commits
    # XXX test merging changes that collide without adding new files
    # XXX test new colliding files under just case-folding
    # XXX test new colliding files under case-folding AND normalization
    # XXX test new colliding directories
    # XXX test invalid UTF-8 paths
    # XXX test multiple rejected files per changeset
    # XXX test multiple rejected changesets per group
    # XXX test adding a set of files colliding with itself, not others
    # XXX test adding a set of directories colliding with itself, not others
    # XXX test a prefix that collides but is not a full file/directory name
    # XXX test a commit that doesn't add any files

    # Notify whoever's pushing of what we're doing -- it might incur an
    # appreciable delay.
    ui.status(
        _(b'checking encoding/normalization/case-sensitivity...\n'))

    # Measure elapsed time with a monotonic clock so that wall-clock
    # adjustments during the check cannot skew the reported duration.
    #
    t0 = time.monotonic()

    # For each revision number revno and file with path fbytes,
    # badrevs[revno][fbytes], if set, is a list of error messages
    # explaining why we are rejecting this revision for this file path.
    #
    badrevs = {}

    # Iterate over all the revisions being added to verify each
    # revision is kosher -- any that are not will be recorded in
    # badrevs to be reported in a batch at the end.
    #
    ctx = repo.unfiltered()[node]
    for revno in repo.changelog.revs(start=ctx.rev()):
        rev = repo[revno]

        def bad(fbytes: bytes, reason: bytes) -> None:
            # Record reason against path fbytes of the revision
            # currently being examined.  Only ever called within the
            # loop iteration that defined it, so revno is current.
            reasons = badrevs.setdefault(revno, {}).setdefault(fbytes, [])
            reasons.append(reason)

        # Verify each file's path is valid UTF-8 in Normal Form C,
        # containing only code points assigned in Unicode 3.2.
        #
        filesadded = set()
        for fbytes in sorted(rev.filesadded()):

            # Decode the file's path as UTF-8.  If that fails, reject
            # it because of that and move on to the other files.
            #
            try:
                f = fbytes.decode('utf-8')
            except UnicodeError:
                bad(fbytes, _(b'Path is invalid UTF-8'))
                del fbytes
                continue

            # If the path isn't in Normal Form C, reject it.
            #
            if not unicodedata.is_normalized('NFC', f):
                bad(fbytes, _(b'Path is not Unicode NFC'))
                del f
                del fbytes
                continue

            # If the code points in path aren't all in Unicode 3.2,
            # reject it.
            #
            for cp in f:
                if unicodedata.ucd_3_2_0.name(cp, None) is None:
                    bad(fbytes, _(b'Path has unassigned code points'))
                    del cp
                    break
                del cp
            else:
                filesadded.add(f)
            del f
            del fbytes

        # Enumerate the paths of all the files _and directories_ added
        # by this changeset.  Map each case-folded path back to all
        # original paths.  Record any collisions we find among files
        # we're adding.
        #
        # XXX Don't report common prefixes twice.
        #
        allfilesdirsadded = {}
        fileprefixseen = set()
        for f in filesadded:
            for fp in _pathprefixes(f):
                if fp in fileprefixseen:
                    # All shorter prefixes were recorded along with
                    # this one, so there is no need to go further up.
                    del fp
                    break
                fileprefixseen.add(fp)
                fpc = fp.casefold()
                if fpc == fp:
                    # Share one string object when folding changed
                    # nothing.
                    fpc = fp
                if fpc not in allfilesdirsadded:
                    allfilesdirsadded[fpc] = set()
                allfilesdirsadded[fpc].add(fp)
                del fpc
                del fp
            del f
        del fileprefixseen
        for fpc in allfilesdirsadded:
            fpset = allfilesdirsadded[fpc]
            if len(fpset) != 1:
                for fp in fpset:
                    bad(fp.encode('utf-8'),
                        _(b'Path collision: %r') %
                        (sorted(fp.encode('utf-8') for fp in fpset),))
                    del fp
            del fpset
            del fpc

        # For each parent of this revision, if the parent wasn't
        # already dinged, check for collisions relative to that
        # parent's existing files.
        #
        # XXX Optimize this for multi-changeset groups by tracking
        # files added and removed among the new changesets so we only
        # have to iterate over all files in the manifests of parents
        # that were already in the repository.  Not trivial: we have to
        # create and track a list of prefixes all the way to each head,
        # so we can match it _once_ against the parent already in the
        # repository.
        #
        # If this changeset adds no checkable paths at all, there is
        # nothing that could collide with a parent, so don't pay for
        # walking the parents' manifests.
        #
        parents = rev.parents() if allfilesdirsadded else []
        for parentctx in parents:
            if parentctx.rev() in badrevs:
                del parentctx
                continue

            # Copy the dictionary of case-folded file prefixes, so we
            # can delete from it as we go, and build a regular
            # expression matching them.
            #
            # Note: Python re.IGNORECASE does not actually work.  For
            # example, re.compile(r'nussschnecke', flags=re.IGNORECASE)
            # fails to match 'Nußschnecke'.  So we have to casefold the
            # input anyway.
            #
            filesdirsadded = allfilesdirsadded.copy()
            pattern = _dirprefixsetmatcher(filesdirsadded)

            # For each file and directory this revision adds, check for
            # collisions with existing files and directories.
            #
            # Mercurial doesn't provide a convenient way to list the
            # files and directories in a directory -- only ways to list
            # complete paths.  So, we bite the bullet and iterate over
            # everything and check for prefixes matching the files or
            # directories we're adding.
            #
            # We use several tricks to reduce the time spent on each
            # path in the repository:
            #
            # 1. We restrict the search by a regular expression of all
            #    the file prefixes.  Regular expression matching is
            #    surprisingly cheap in Python (really, everything else
            #    -- arithmetic, data structure access, variable
            #    reference -- is surprisingly expensive).
            #
            # 2. We remove each prefix we have already checked -- there
            #    is no sense in checking the same directory for more
            #    collisions.
            #
            # 3. We update the regular expression when we remove any
            #    prefixes so we can skip paths more quickly next time.
            #
            # 4. We stop walking the manifest as soon as every prefix
            #    has been checked.
            #
            # This is optimized for existing trees with very large
            # numbers of files (say, hundreds of thousands), and new
            # changesets that add relatively small numbers of files.
            # It is unlikely to perform well for adding a large number
            # of files to a small existing repository, like committing
            # a snapshot of an operating system all at once.
            #
            collisions = {}
            for pbytes in parentctx.manifest().walk(match.alwaysmatcher()):
                # Paths already in the repository are not guaranteed
                # to be valid UTF-8 (they may predate this hook, or
                # come from a rejected ancestor in this same group).
                # Decode undecodable bytes to lone surrogates instead
                # of raising, so that the valid leading components of
                # such a path are still compared.
                #
                p = pbytes.decode('utf-8', 'surrogateescape')
                pc = p.casefold()
                if pc == p:
                    pc = p
                if not pattern.match(pc):
                    del pc
                    del p
                    del pbytes
                    continue

                filesdirsdone = None
                for fpc in filesdirsadded:
                    if pc.startswith(fpc) and \
                       (pc == fpc or pc[len(fpc)] == '/'):

                        # The case-folded path pc of a file already in
                        # the repository starts with a case-folded
                        # prefix fpc of a path we're adding.  Check
                        # whether all the originals match.  If not,
                        # report a collision.
                        #
                        for fp in filesdirsadded[fpc]:
                            if p.startswith(fp):
                                del fp
                                continue
                            assert fpc == fp.casefold()
                            n = _pathcompcount(fp)
                            pp = _pathncompprefix(p, n)
                            collisions[fp] = pp
                            del pp
                            del n
                            del fp

                        # Don't bother checking files that match _only_
                        # this (case-folded) prefix any more: we have
                        # already recorded any such collisions.  Add it
                        # to the list of prefixes we are done matching.
                        #
                        if filesdirsdone is None:
                            filesdirsdone = [fpc]
                        else:
                            filesdirsdone.append(fpc)
                    del fpc

                # If we matched any prefixes, stop searching for them
                # on the next iteration: delete from filesdirsadded,
                # and recompute the regular expression filter.  We have
                # to do the deletions outside the loop (dictionary
                # iteration forbids interleaved deletion), and it's
                # cheaper to recompute the regular expression once in a
                # batch before we next use it.
                #
                if filesdirsdone is not None:
                    for fpc in filesdirsdone:
                        del filesdirsadded[fpc]
                        del fpc
                    pattern = _dirprefixsetmatcher(filesdirsadded)
                del filesdirsdone

                del pc
                del p
                del pbytes

                # Once every prefix has been checked, nothing else in
                # this manifest can matter; skip the rest of the walk.
                #
                if not filesdirsadded:
                    break

            del pattern
            del filesdirsadded

            # For each collision with a prefix of an existing file,
            # report the _shortest_ matching prefix.  For example, if
            # there already is foo/bar/baz, and the commit adds
            # foo/BAR/baz, report the collision foo/bar <-> foo/BAR,
            # not the collision foo/bar/baz <-> foo/BAR/baz.
            #
            for fp in collisions:
                for fpp in _pathprefixes(fp):
                    if fpp != fp and fpp in collisions:
                        del fpp
                        break
                    del fpp
                else:
                    pp = collisions[fp]
                    # Encode with the same error handler used for
                    # decoding so the original bytes round-trip.
                    bad(fp.encode('utf-8'),
                        _(b'File name collides with %r from parent %s') %
                        (pp.encode('utf-8', 'surrogateescape'), parentctx))
                    del pp
                del fp
            del collisions
            del parentctx

        del parents
        del allfilesdirsadded
        del rev
        del revno

    t1 = time.monotonic()
    ui.status(
        _(b'%f sec to check encoding/normalization/case-sensitivity\n') %
        (t1 - t0,))

    # If any revisions were rejected, identify each revision and
    # explain why.  Truncate the description to 40 characters, but
    # don't treat it as toxic waste; we are reporting this back to the
    # caller who provided the description anyway.
    #
    # XXX Consider limiting the total length of this message.
    #
    if badrevs:
        ui.error(_(b'Rejecting the following revisions because:\n'))
        for revno in sorted(badrevs):
            ui.error(
                b'%s (%s):\n%s\n' %
                (repo[revno],
                 stringutil.firstline(repo[revno].description())[:40],
                 b'\n'.join(
                     b'- %r: %s' % (f, reason)
                     for f in sorted(badrevs[revno])
                     for reason in badrevs[revno][f]))
            )
            del revno
        raise error.Abort(
            _(b'encoding/normalization/case-sensitivity trouble found'))
    else:
        ui.status(
            _(b'no encoding/normalization/case-sensitivity trouble found\n'))
