# Source code for hofmann.model.atom_data

"""Validated container for per-atom metadata arrays."""

from __future__ import annotations

from collections.abc import Iterator, Mapping
from types import MappingProxyType

import numpy as np

from hofmann.model.colour import _is_categorical_missing


def _compute_global_range(
    arr: np.ndarray,
) -> tuple[float, float] | None:
    """Compute the global ``(min, max)`` for a 2-D numeric array.

    Returns ``None`` for 1-D arrays, categorical (string/object)
    dtypes, empty arrays, or arrays where every value is NaN.
    """
    if arr.ndim != 2 or arr.dtype.kind in ("U", "O"):
        return None
    if arr.size == 0:
        return None
    if np.all(np.isnan(arr)):
        return None
    lo = float(np.nanmin(arr))
    hi = float(np.nanmax(arr))
    return (lo, hi)


def _compute_global_labels(arr: np.ndarray) -> tuple[str, ...] | None:
    """Collect the distinct non-missing labels of a 2-D categorical array.

    Every element of ``arr.ravel()`` that
    :func:`_is_categorical_missing` does not flag as missing is
    converted with :class:`str` and recorded once.

    The result keeps first-encountered order over ``arr.ravel()``.
    That order is load-bearing: downstream colourmap assignment
    (``_resolve_categorical``) indexes into this tuple via
    :func:`enumerate`, so position decides which colour a label
    receives.  Do not sort or otherwise reorder the result -- doing
    so would silently change per-atom colours.

    Args:
        arr: Array to scan.

    Returns:
        Tuple of unique label strings, or ``None`` when *arr* is not
        2-D, does not have a string/object dtype, or contains no
        non-missing values.
    """
    is_categorical = arr.dtype.kind in ("U", "O")
    if not is_categorical or arr.ndim != 2:
        return None
    # dict keys preserve insertion order and ignore repeats, which
    # gives de-duplication with first-seen ordering in one pass.
    ordered = dict.fromkeys(
        str(value)
        for value in arr.ravel()
        if not _is_categorical_missing(value)
    )
    return tuple(ordered) or None



# [docs]
class AtomData(Mapping[str, np.ndarray]):
    """Per-atom metadata container.

    The supported way to obtain an ``AtomData`` is via
    :attr:`~hofmann.StructureScene.atom_data`; the class is not
    re-exported from ``hofmann`` or ``hofmann.model``, and direct
    construction is considered an internal implementation detail.
    User-facing access goes through
    :attr:`~hofmann.StructureScene.atom_data` for reads,
    :meth:`~hofmann.StructureScene.set_atom_data`,
    :meth:`~hofmann.StructureScene.del_atom_data`, and
    :meth:`~hofmann.StructureScene.clear_2d_atom_data` for writes.

    Stores named per-atom arrays.  Each value is either a 1-D array
    of shape ``(n_atoms,)`` (static across the trajectory) or a 2-D
    array of shape ``(m, n_atoms)`` where ``m`` is the trajectory
    length the caller declares at write time via the
    ``expected_frames`` kwarg on ``_set``.  All stored 2-D entries
    in a single container must share the same ``m``; this cross-
    entry invariant is enforced at assignment.

    The container holds no cached frame count between calls; each
    ``_set`` is told ``expected_frames`` by the caller, and the
    invariant is re-derived from the stored data on every call.
    Frame consistency is enforced at two sites:

    - At assignment, via ``_set`` calling ``_check_2d_consistency``
      with ``pending={key: arr}``, validating the prospective
      post-write state against ``expected_frames``.  Both the
      incoming array and any already-stored 2-D entries not being
      overridden are checked.
    - At render, via
      :meth:`~hofmann.StructureScene.render_mpl` (and friends)
      calling the scene's private ``_validate_for_render`` helper,
      which in turn calls ``_check_2d_consistency`` with no
      ``pending``.  This validates the current stored state against
      ``len(scene.frames)`` as a backstop that catches the specific
      case where ``scene.frames`` is mutated after the last write
      but before the next render.

    Inherits from :class:`collections.abc.Mapping` (not
    :class:`~collections.abc.MutableMapping`).  Mutation goes
    through the private ``_set``, ``_del``, and ``_clear_2d``
    methods; no ``ad[key] = value`` or ``del ad[key]`` shortcut
    exists.  Assigned values are always copied via
    :func:`numpy.array` -- including existing numpy arrays -- so
    the container owns the buffer and the caller's source array is
    left untouched.

    .. note::

       Stored arrays are returned read-only.  In-place mutation
       (e.g. ``ad["charge"][0] = 99``) raises
       ``ValueError: assignment destination is read-only``.  To
       update values, pass a fresh array through
       :meth:`~hofmann.StructureScene.set_atom_data`, which
       re-validates the shape and recomputes the :attr:`ranges`
       and :attr:`labels` entries for the key.  Only the array
       buffer is frozen -- for ``object``-dtype arrays, any
       mutable objects stored inside remain mutable.

    Attributes:
        n_atoms: The number of atoms the container was built for.
            Fixed at construction and not mutable.
        ranges: Read-only mapping of keys to ``(min, max)`` tuples
            for 2-D numeric arrays, or ``None`` for keys that do
            not have a meaningful numeric range (1-D arrays,
            categorical arrays, empty arrays, all-NaN numeric
            arrays).  Entries are added on assignment, replaced on
            reassignment, and removed on deletion.
        labels: Read-only mapping of keys to tuples of unique
            non-missing categorical labels, or ``None`` for keys
            without a meaningful label set (1-D arrays, numeric
            dtypes, categorical arrays with no non-missing values).
            Missing values (``None``, ``""``, NaN) are excluded
            from the label set.  Entries are added on assignment,
            replaced on reassignment, and removed on deletion.

    For 2-D arrays, ``ranges`` is populated for numeric dtypes and
    ``labels`` for categorical dtypes; the other side is always
    ``None``.  Either side may itself be ``None`` for empty arrays
    or arrays containing only missing values.  For 1-D arrays, both
    are ``None``.

    Args:
        n_atoms: Number of atoms in the scene.  Non-negative.

    Raises:
        ValueError: If *n_atoms* is negative.
    """

    def __init__(self, *, n_atoms: int) -> None:
        """Create an empty container sized for *n_atoms* atoms.

        Args:
            n_atoms: Number of atoms every stored array must cover.

        Raises:
            ValueError: If *n_atoms* is negative.
        """
        if n_atoms < 0:
            raise ValueError(f"n_atoms must be non-negative, got {n_atoms}")
        self._n_atoms = n_atoms
        self._data: dict[str, np.ndarray] = {}
        self._ranges: dict[str, tuple[float, float] | None] = {}
        self._labels: dict[str, tuple[str, ...] | None] = {}
        # The proxies are live read-only views over the private dicts:
        # they are built once here and track every later mutation, so
        # the ``ranges`` / ``labels`` properties never need to copy.
        self._ranges_view: Mapping[str, tuple[float, float] | None] = (
            MappingProxyType(self._ranges)
        )
        self._labels_view: Mapping[str, tuple[str, ...] | None] = (
            MappingProxyType(self._labels)
        )

    @property
    def n_atoms(self) -> int:
        """Number of atoms the container was constructed for."""
        return self._n_atoms

    @property
    def ranges(self) -> Mapping[str, tuple[float, float] | None]:
        """Read-only live view of the per-key ``(min, max)`` ranges."""
        return self._ranges_view

    @property
    def labels(self) -> Mapping[str, tuple[str, ...] | None]:
        """Read-only live view of the per-key categorical label tuples."""
        return self._labels_view

    def _check_2d_consistency(
        self,
        expected: int,
        *,
        pending: Mapping[str, np.ndarray] | None = None,
    ) -> None:
        """Check the 2-D shape invariant against *expected* for the
        prospective post-operation state.

        The prospective state is the current stored state with any
        keys in *pending* treated as overrides: a pending entry
        replaces the stored entry of the same key (for reassignment)
        or adds a new entry (for a first write).  Stored entries
        whose keys appear in *pending* are skipped during validation
        because they will no longer exist in the post-operation
        state.

        Called at two sites, both with ``expected=len(scene.frames)``:

        - ``_set`` with ``pending={key: arr}`` to validate the state
          that would result from storing *arr* under *key*.
        - ``StructureScene._validate_for_render`` with no *pending*
          (current stored state is the prospective state) at the
          start of every public ``render_*`` method.

        Pending entries are checked before stored entries so that a
        user passing a wrong-shape array gets an actionable
        call-site error, not a confusing stale-stored error.

        Stored entries form an equivalence class under the cross-
        entry invariant (all 2-D share the same ``shape[0]``), so
        once the walk finds a non-overridden 2-D entry that passes,
        it short-circuits -- any other non-overridden 2-D entries
        would report the same shape.

        Args:
            expected: The ``shape[0]`` value to check against.
            pending: Optional mapping of keys to arrays being
                written.  Pending arrays with ``ndim != 2`` are
                ignored for this check.

        Raises:
            ValueError: If any pending or non-overridden stored 2-D
                entry has ``shape[0] != expected``.
        """
        pending = pending or {}
        # Check pending entries first: errors on these are caused by
        # the user's current call, so their messages are the most
        # actionable.
        for pending_key, arr in pending.items():
            if arr.ndim == 2 and arr.shape[0] != expected:
                raise ValueError(
                    f"atom_data[{pending_key!r}] has {arr.shape[0]} "
                    f"rows but {expected} frames were expected"
                )
        # Then check stored entries, skipping any overridden by
        # pending: those stored entries will be replaced by the
        # operation that is about to complete.
        for stored_key, arr in self._data.items():
            if stored_key in pending:
                continue
            if arr.ndim == 2:
                if arr.shape[0] != expected:
                    raise ValueError(
                        f"atom_data has stale 2-D entry "
                        f"{stored_key!r} sized for {arr.shape[0]} "
                        f"frames, but {expected} frames were "
                        f"expected; call "
                        f"scene.clear_2d_atom_data() to recover "
                        f"before reassigning"
                    )
                # One passing 2-D entry vouches for the rest (they
                # all share ``shape[0]``), so stop walking here.
                return

    def _set(
        self,
        key: str,
        value: object,
        *,
        expected_frames: int,
    ) -> None:
        """Store *value* under *key* with full validation.

        Private internal write method called from ``StructureScene``
        plumbing (``set_atom_data``, ``__init__``), not part of any
        public protocol.  The container inherits from
        :class:`~collections.abc.Mapping` (not
        :class:`~collections.abc.MutableMapping`), so there is no
        ``ad[key] = value`` shortcut for users.

        For 2-D values, the post-operation state (the current stored
        state with *key* replaced by *value*) must have every 2-D
        entry sized for *expected_frames*.  This catches both
        user-supplied arrays that do not match the scene's
        trajectory and stale stored entries left over from an
        earlier ``scene.frames`` mutation.  Reassigning a single 2-D
        entry with a new shape is allowed when it is the only 2-D
        entry, because the stored entry for *key* is treated as
        overridden by the pending write and skipped during the walk.

        Args:
            key: Entry key.
            value: Array-like input.
            expected_frames: The scene's current ``len(frames)``;
                required on every call because the container exists
                only to back a scene.

        Raises:
            ValueError: If *value* does not coerce to a 1-D array of
                length ``n_atoms`` or a 2-D array of shape
                ``(expected_frames, n_atoms)``, if a non-overridden
                stored 2-D entry is stale relative to
                *expected_frames*, or if the value has an
                unsupported dtype (only bool, integer, float,
                string, and object are accepted).
        """
        # ``np.array`` always copies, so the container owns the buffer
        # it is about to freeze and the caller's array is untouched.
        arr = np.array(value)
        # Input validation: ndim and per-atom shape.  Pure function of
        # *value*; no dependence on stored state.
        if arr.ndim == 1:
            if len(arr) != self._n_atoms:
                raise ValueError(
                    f"atom_data[{key!r}] must have length "
                    f"{self._n_atoms}, got {len(arr)}"
                )
        elif arr.ndim == 2:
            if arr.shape[1] != self._n_atoms:
                raise ValueError(
                    f"atom_data[{key!r}] must have {self._n_atoms} "
                    f"columns (one per atom), got {arr.shape[1]}"
                )
        else:
            raise ValueError(
                f"atom_data[{key!r}] must be 1-D or 2-D, "
                f"got {arr.ndim}-D"
            )
        # Input validation: dtype whitelist.  Also a pure function of
        # *value*.  Must fire before any state-dependent check so a
        # user passing a bad-dtype array is told about the dtype
        # directly, rather than being sent on an unnecessary
        # ``clear_2d_atom_data()`` recovery for a write that was
        # never going to land.
        if arr.dtype.kind not in ("b", "i", "u", "f", "U", "O"):
            raise ValueError(
                f"atom_data[{key!r}] has unsupported dtype "
                f"{arr.dtype}; supported dtypes are bool, integer, "
                f"float, string, and object"
            )
        # State-dependent invariant: pending shape against expected
        # frames, and any non-overridden stored 2-D entries.  Runs
        # after all input checks so input errors take priority.
        if arr.ndim == 2:
            self._check_2d_consistency(
                expected_frames, pending={key: arr}
            )
        arr.flags.writeable = False
        # Derive the metadata before touching any stored state, so a
        # failure while computing it leaves the container unchanged.
        new_range = _compute_global_range(arr)
        new_labels = _compute_global_labels(arr)
        self._data[key] = arr
        self._ranges[key] = new_range
        self._labels[key] = new_labels

    def __getitem__(self, key: str) -> np.ndarray:
        """Return the stored array for *key*.

        Arrays written through ``_set`` are marked read-only before
        being stored, so the returned array rejects in-place writes.

        Raises:
            KeyError: If *key* is not present.
        """
        return self._data[key]

    def _del(self, key: str) -> None:
        """Remove an entry by key.

        Private internal delete method called from ``StructureScene``
        plumbing (``del_atom_data`` and ``clear_2d_atom_data``), not
        part of any public protocol.  The container inherits from
        :class:`~collections.abc.Mapping` (not
        :class:`~collections.abc.MutableMapping`), so there is no
        ``del ad[key]`` shortcut for users.

        Raises:
            KeyError: If *key* is not present.
        """
        # ``_data`` goes first: a missing key raises here, before the
        # derived ``_ranges`` / ``_labels`` bookkeeping is touched.
        del self._data[key]
        del self._ranges[key]
        del self._labels[key]

    def _clear_2d(self) -> None:
        """Remove all 2-D entries, leaving 1-D entries untouched.

        Private helper called from
        :meth:`~hofmann.StructureScene.clear_2d_atom_data`.  After
        this method runs, the cross-entry 2-D shape constraint is
        released, so a subsequent ``_set`` with a 2-D array of any
        ``shape[0]`` will succeed.  1-D entries and their derived
        ``ranges`` / ``labels`` metadata are preserved.

        Uses ``_del`` internally so all bookkeeping for removed
        entries stays in one place.
        """
        # Snapshot the keys first: deleting from ``_data`` while
        # iterating over it would raise a RuntimeError.
        keys_to_delete = [
            k for k, v in self._data.items() if v.ndim == 2
        ]
        for key in keys_to_delete:
            self._del(key)

    def __iter__(self) -> Iterator[str]:
        """Iterate over the stored keys in insertion order."""
        return iter(self._data)

    def __len__(self) -> int:
        """Return the number of stored entries."""
        return len(self._data)

    def __repr__(self) -> str:
        """Return a compact summary listing each key with its array shape."""
        if not self._data:
            return "AtomData()"
        items = [
            f"{key!r}: {arr.shape}"
            for key, arr in self._data.items()
        ]
        return f"AtomData({{{', '.join(items)}}})"