"""Reusable workflow helpers for examples and small supervised UFP studies."""
from __future__ import annotations
from typing import Sequence
import ase
import numpy as np
import torch
from ufp.terms import ElementOneBodyTerm, UFPModel
from ufp.training import ASEAtomsDataset
[docs]
def onebody_reference_energy(model: UFPModel) -> float:
"""Return the scalar value from a single ``ElementOneBodyTerm`` model."""
if len(model.onebody_terms) != 1:
raise ValueError("expected exactly one one-body term")
term = model.onebody_terms[0]
if not isinstance(term, ElementOneBodyTerm):
raise TypeError("expected the one-body term to be ElementOneBodyTerm")
return float(term.values.detach().cpu().reshape(-1)[0].item())
[docs]
def onebody_reference_energies(model: UFPModel) -> dict[int, float]:
"""Return all element one-body values keyed by atomic number."""
values_by_type: dict[int, float] = {}
for term in model.onebody_terms:
if not isinstance(term, ElementOneBodyTerm):
raise TypeError("expected one-body terms to be ElementOneBodyTerm")
values = term.values.detach().cpu().reshape(-1)
if values.numel() != len(term.atomic_types):
raise ValueError("one-body values do not match declared atomic types")
for atomic_type, value in zip(term.atomic_types, values, strict=True):
if int(atomic_type) in values_by_type:
raise ValueError(f"duplicate one-body value for Z={atomic_type}")
values_by_type[int(atomic_type)] = float(value.item())
return values_by_type
def _target_energy_from_atoms(
atoms: ase.Atoms,
*,
energy_key: str | None,
) -> float:
"""Read one total-energy target from an ASE object."""
if energy_key is not None:
if energy_key in atoms.info:
return float(atoms.info[energy_key])
if energy_key in atoms.arrays:
value = np.asarray(atoms.arrays[energy_key], dtype=float)
if value.size != 1:
raise ValueError(f"`{energy_key}` must contain one scalar energy")
return float(value.reshape(-1)[0])
raise KeyError(f"ASE atoms object is missing energy target `{energy_key}`")
return float(atoms.get_potential_energy())
[docs]
def fit_element_onebody_energies(
atoms_list: Sequence[ase.Atoms],
energies: Sequence[float] | None = None,
*,
atomic_types: Sequence[int] | None = None,
energy_key: str | None = None,
rcond: float | None = None,
) -> dict[int, float]:
"""
Fit composition-only per-element reference energies by least squares.
Args:
atoms_list: ASE structures whose elemental compositions define the linear
system.
energies: Optional total energies, one per structure. If omitted, energies
are read with ``atoms.get_potential_energy()`` unless ``energy_key`` is
set.
atomic_types: Atomic numbers to fit. When omitted, all elements observed in
``atoms_list`` are fitted.
energy_key: Optional key in ``atoms.info`` or ``atoms.arrays`` used when
``energies`` is omitted.
rcond: Cutoff passed to ``numpy.linalg.lstsq``.
Returns:
Mapping from atomic number to fitted reference energy in eV.
Raises:
KeyError: If ``energy_key`` is set but missing from a structure.
TypeError: If ``atoms_list`` contains non-ASE objects.
ValueError: If the inputs are empty or have inconsistent lengths.
"""
structures = list(atoms_list)
if not structures:
raise ValueError("`atoms_list` must contain at least one ASE structure")
for atoms in structures:
if not isinstance(atoms, ase.Atoms):
raise TypeError(f"`atoms_list` must contain ase.Atoms, got {type(atoms)}")
if atomic_types is None:
fitted_types = tuple(
sorted({int(number) for atoms in structures for number in atoms.numbers})
)
else:
fitted_types = tuple(sorted(set(int(number) for number in atomic_types)))
if not fitted_types:
raise ValueError("at least one atomic type must be fitted")
if energies is None:
targets = np.asarray(
[
_target_energy_from_atoms(atoms, energy_key=energy_key)
for atoms in structures
],
dtype=float,
)
else:
targets = np.asarray(list(energies), dtype=float)
if targets.shape != (len(structures),):
raise ValueError(
"`energies` must contain exactly one scalar energy per structure"
)
column_by_type = {
atomic_type: column for column, atomic_type in enumerate(fitted_types)
}
counts = np.zeros((len(structures), len(fitted_types)), dtype=float)
for row, atoms in enumerate(structures):
atomic_numbers, atomic_counts = np.unique(
np.asarray(atoms.numbers, dtype=int),
return_counts=True,
)
for atomic_number, atomic_count in zip(
atomic_numbers,
atomic_counts,
strict=True,
):
column = column_by_type.get(int(atomic_number))
if column is not None:
counts[row, column] = float(atomic_count)
values, *_ = np.linalg.lstsq(counts, targets, rcond=rcond)
return {
atomic_type: float(values[index])
for index, atomic_type in enumerate(fitted_types)
}
[docs]
def initialize_onebody_terms_from_atoms(
model: UFPModel,
atoms_list: Sequence[ase.Atoms],
energies: Sequence[float] | None = None,
*,
energy_key: str | None = None,
rcond: float | None = None,
) -> dict[int, float]:
"""Initialize trainable element one-body terms from composition least squares."""
trainable_terms = [
term
for term in model.onebody_terms
if isinstance(term, ElementOneBodyTerm) and term.values.requires_grad
]
if not trainable_terms:
return {}
fitted = fit_element_onebody_energies(
atoms_list,
energies,
energy_key=energy_key,
rcond=rcond,
)
missing_types = sorted(
set(fitted).difference(
int(atomic_type)
for term in trainable_terms
for atomic_type in term.atomic_types
)
)
if missing_types:
raise ValueError(
"trainable ElementOneBodyTerm coverage is missing observed atomic "
f"types {missing_types}"
)
unobserved_types = sorted(
{
int(atomic_type)
for term in trainable_terms
for atomic_type in term.atomic_types
}.difference(fitted)
)
if unobserved_types:
raise ValueError(
"can not initialize one-body terms for unobserved atomic types "
f"{unobserved_types}"
)
with torch.no_grad():
for term in trainable_terms:
values = torch.tensor(
[fitted[int(atomic_type)] for atomic_type in term.atomic_types],
dtype=term.values.dtype,
device=term.values.device,
)
term.values.copy_(values)
return {
int(atomic_type): float(value)
for term in trainable_terms
for atomic_type, value in zip(
term.atomic_types,
term.values.detach().cpu().reshape(-1).tolist(),
strict=True,
)
}
[docs]
def initialize_onebody_terms_from_dataset(
model: UFPModel,
dataset: ASEAtomsDataset,
*,
rcond: float | None = None,
) -> dict[int, float]:
"""Initialize trainable element one-body terms from an ASE training dataset."""
atoms_list = [sample.atoms for sample in dataset] # type: ignore[attr-defined]
energies = [sample.energy for sample in dataset] # type: ignore[attr-defined]
if any(energy is None for energy in energies):
return {}
return initialize_onebody_terms_from_atoms(
model,
atoms_list,
[float(energy) for energy in energies if energy is not None],
rcond=rcond,
)