"""Reusable workflow helpers for examples and small supervised UFP studies."""
from __future__ import annotations
from pathlib import Path
from typing import Sequence
import ase
import numpy as np
import torch
from ufp.leastsquares import FitSample
from ufp.terms import UFPModel
from ufp.training import ASEAtomsDataset
from ufp.workflows.data import SupervisedAtomsDataset
[docs]
def ase_training_dataset_from_frames(
    frames: Sequence[ase.Atoms],
    energies: Sequence[float],
    forces: Sequence[object],
    *,
    indices: Sequence[int] | None = None,
    force_masks: Sequence[object] | None = None,
) -> ASEAtomsDataset:
    """Assemble an ``ASEAtomsDataset`` from per-frame energies and forces.

    Parameters
    ----------
    frames, energies, forces
        Parallel sequences that are looked up by frame index.
    indices
        Frame indices to include, in output order. ``None`` selects every
        frame in ``frames``.
    force_masks
        Optional masks, one per *selected* frame: entry ``k`` belongs to the
        ``k``-th selected index, not to frame number ``k``.

    Returns
    -------
    ASEAtomsDataset
        Result of ``ASEAtomsDataset.from_atoms`` called with copies of the
        selected frames, float energies, float force arrays and (when given)
        boolean mask arrays.

    Raises
    ------
    ValueError
        If ``force_masks`` is given and its length differs from the number
        of selected frames.
    """
    if indices is None:
        chosen = list(range(len(frames)))
    else:
        chosen = [int(index) for index in indices]

    masks = tuple(force_masks) if force_masks is not None else None
    if masks is not None and len(masks) != len(chosen):
        raise ValueError("`force_masks` must contain one mask per selected frame")

    # One record per selected frame, converted in selection order so that a
    # bad entry is reported for the first offending frame.
    records = [
        (
            frames[frame_index].copy(),
            float(energies[frame_index]),
            np.asarray(forces[frame_index], dtype=float),
            None if masks is None else np.asarray(masks[position], dtype=bool),
        )
        for position, frame_index in enumerate(chosen)
    ]

    return ASEAtomsDataset.from_atoms(
        [record[0] for record in records],
        energies=[record[1] for record in records],
        forces=[record[2] for record in records],
        force_masks=None if masks is None else [record[3] for record in records],
    )
[docs]
def fit_samples_from_dataset(
    dataset: SupervisedAtomsDataset,
    *,
    indices: Sequence[int] | np.ndarray | None = None,
    sample_weights: dict[int, float] | None = None,
    energy_weight: float = 1.0,
    force_weight: float = 1.0,
    per_atom_weight: float = 1.0,
) -> list[FitSample]:
    """Build one ``FitSample`` per selected labeled frame.

    Parameters
    ----------
    dataset
        Source of ``frames``, ``energies``, ``forces`` and the default
        ``training_indices``.
    indices
        Frame indices to convert. ``None`` uses ``dataset.training_indices``.
    sample_weights
        Optional per-frame multipliers keyed by integer frame index; frames
        without an entry use ``1.0``.
    energy_weight, force_weight, per_atom_weight
        Global weights; each is multiplied by the per-frame multiplier before
        being stored on the sample.

    Returns
    -------
    list[FitSample]
        Samples in the order of the selected indices. Each sample holds a copy
        of the frame; the force entry is passed through as stored.

    Raises
    ------
    ValueError
        If any global weight or any looked-up per-frame multiplier is
        negative.
    """
    chosen = indices if indices is not None else dataset.training_indices
    per_sample = sample_weights if sample_weights is not None else {}

    global_weights = {
        "energy_weight": float(energy_weight),
        "force_weight": float(force_weight),
        "per_atom_weight": float(per_atom_weight),
    }
    for name, value in global_weights.items():
        if value < 0.0:
            raise ValueError(f"`{name}` must be non-negative")

    def build_sample(raw_index):
        frame_index = int(raw_index)
        factor = float(per_sample.get(frame_index, 1.0))
        if factor < 0.0:
            raise ValueError("`sample_weights` values must be non-negative")
        return FitSample(
            atoms=dataset.frames[frame_index].copy(),
            energy=float(dataset.energies[frame_index]),
            forces=dataset.forces[frame_index],
            energy_weight=factor * global_weights["energy_weight"],
            force_weight=factor * global_weights["force_weight"],
            per_atom_weight=factor * global_weights["per_atom_weight"],
        )

    return [build_sample(raw_index) for raw_index in chosen]
[docs]
def split_frames(
    dataset: SupervisedAtomsDataset,
    indices: Sequence[int] | np.ndarray,
) -> list[ase.Atoms]:
    """Copy the dataset frames addressed by ``indices``, preserving order.

    Each index is converted with ``int`` before the lookup, and every returned
    frame is produced by the stored frame's ``copy()`` method.
    """
    copies: list[ase.Atoms] = []
    for raw_index in indices:
        stored_frame = dataset.frames[int(raw_index)]
        copies.append(stored_frame.copy())
    return copies
[docs]
def prediction_metrics_for_split(
    model: UFPModel,
    dataset: SupervisedAtomsDataset,
    indices: Sequence[int] | np.ndarray,
    *,
    batch_size: int,
    device: torch.device | str | None = None,
    dtype: torch.dtype | None = None,
    progress: bool = False,
) -> dict[str, np.ndarray | float]:
    """Evaluate ``model`` on one split and return energy/force metrics.

    Parameters
    ----------
    model
        Model handed to ``ufp.analysis.predict_atoms``.
    dataset
        Source of frames and of the reference ``energies``, ``forces`` and
        ``sizes``.
    indices
        Frame indices of the split; converted to an integer array.
    batch_size, device, dtype, progress
        Forwarded unchanged to ``predict_atoms``.

    Returns
    -------
    dict
        Whatever ``ufp.analysis.compute_energy_force_metrics`` returns for the
        reference and predicted values of the split.
    """
    # Imported lazily, as in the rest of this module's prediction helpers.
    from ufp.analysis import compute_energy_force_metrics, predict_atoms

    selection = np.asarray(indices, dtype=int)
    split_atoms = split_frames(dataset, selection)
    energy_predictions, force_predictions = predict_atoms(
        model,
        split_atoms,
        batch_size=batch_size,
        device=device,
        dtype=dtype,
        progress=progress,
    )

    reference_energies = dataset.energies[selection]
    reference_forces = [dataset.forces[int(position)] for position in selection]
    atom_counts = dataset.sizes[selection]
    return compute_energy_force_metrics(
        reference_energies,
        reference_forces,
        atom_counts,
        energy_predictions,
        force_predictions,
    )
def _tensor_array(value: object) -> np.ndarray:
"""Return one tensor-like value as a detached NumPy array."""
return np.asarray(torch.as_tensor(value).detach().cpu(), dtype=float)
def _uncertainty_arrays_for_split(
model: UFPModel,
frames: Sequence[ase.Atoms],
sizes: np.ndarray,
*,
posterior: object,
aleatoric_variance: float | None,
aleatoric_noise_bundle: object | None,
energy_variance_scale: float,
include_forces: bool,
) -> dict[str, np.ndarray]:
"""Return uncertainty arrays for one prediction split."""
from ufp.uncertainty import predict_with_uncertainty
energy_epistemic = []
energy_aleatoric = []
energy_total = []
per_atom_epistemic = []
per_atom_aleatoric = []
per_atom_total = []
force_epistemic = []
force_aleatoric = []
force_total = []
for atoms in frames:
uncertainty = predict_with_uncertainty(
model,
atoms,
posterior, # type: ignore[arg-type]
aleatoric_variance=aleatoric_variance,
aleatoric_noise_bundle=aleatoric_noise_bundle, # type: ignore[arg-type]
include_forces=include_forces,
return_rows=False,
)
energy_epistemic.append(
float(_tensor_array(uncertainty.energy_epistemic_variance).reshape(-1)[0])
)
energy_aleatoric.append(
float(_tensor_array(uncertainty.energy_aleatoric_variance).reshape(-1)[0])
)
energy_total.append(
float(_tensor_array(uncertainty.energy_total_variance).reshape(-1)[0])
)
per_atom_epistemic.append(
_tensor_array(uncertainty.per_atom_energy_epistemic_variance).reshape(-1)
)
per_atom_aleatoric.append(
_tensor_array(uncertainty.per_atom_energy_aleatoric_variance).reshape(-1)
)
per_atom_total.append(
_tensor_array(uncertainty.per_atom_energy_total_variance).reshape(-1)
)
if include_forces:
force_epistemic.append(
_tensor_array(uncertainty.force_epistemic_variance).reshape(-1)
)
force_aleatoric.append(
_tensor_array(uncertainty.force_aleatoric_variance).reshape(-1)
)
force_total.append(
_tensor_array(uncertainty.force_total_variance).reshape(-1)
)
scale = float(energy_variance_scale)
if scale <= 0.0:
raise ValueError("`energy_variance_scale` must be positive")
energy_epistemic_array = scale * np.asarray(energy_epistemic, dtype=float)
energy_aleatoric_array = scale * np.asarray(energy_aleatoric, dtype=float)
energy_total_array = scale * np.asarray(energy_total, dtype=float)
arrays = {
"energy_epistemic_variance": energy_epistemic_array,
"energy_aleatoric_variance": energy_aleatoric_array,
"energy_total_variance": energy_total_array,
"energy_epistemic_std_per_atom": np.sqrt(
np.maximum(energy_epistemic_array, 0.0)
)
/ sizes,
"energy_total_std_per_atom": np.sqrt(np.maximum(energy_total_array, 0.0))
/ sizes,
"per_atom_energy_epistemic_variance": scale
* np.concatenate(per_atom_epistemic),
"per_atom_energy_aleatoric_variance": scale
* np.concatenate(per_atom_aleatoric),
"per_atom_energy_total_variance": scale * np.concatenate(per_atom_total),
}
if include_forces:
arrays.update(
{
"force_epistemic_variance_components": np.concatenate(force_epistemic),
"force_aleatoric_variance_components": np.concatenate(force_aleatoric),
"force_total_variance_components": np.concatenate(force_total),
}
)
return arrays
[docs]
def save_prediction_split(
    model: UFPModel,
    dataset: SupervisedAtomsDataset,
    indices: Sequence[int] | np.ndarray,
    *,
    filename: Path | str,
    batch_size: int,
    device: torch.device | str | None = None,
    dtype: torch.dtype | None = None,
    progress: bool = False,
    uncertainty_posterior: object | None = None,
    uncertainty_aleatoric_variance: float | None = None,
    uncertainty_aleatoric_noise_bundle: object | None = None,
    uncertainty_energy_variance_scale: float = 1.0,
    uncertainty_forces: bool = False,
) -> dict[str, np.ndarray | float]:
    """Predict one split, write a standard ``.npz``, and return metrics.

    Parameters
    ----------
    model
        Model handed to ``ufp.analysis.predict_atoms``.
    dataset
        Source of frames and of the reference ``energies``, ``forces`` and
        ``sizes``.
    indices
        Frame indices of the split; converted to an integer array.
    filename
        Destination of the archive, as a ``Path`` or a plain string. Missing
        parent directories are created.
    batch_size, device, dtype, progress
        Forwarded unchanged to ``predict_atoms``.
    uncertainty_posterior
        When not ``None``, uncertainty arrays from
        ``_uncertainty_arrays_for_split`` are added to the archive.
    uncertainty_aleatoric_variance, uncertainty_aleatoric_noise_bundle,
    uncertainty_energy_variance_scale, uncertainty_forces
        Forwarded to ``_uncertainty_arrays_for_split``; unused when
        ``uncertainty_posterior`` is ``None``.

    Returns
    -------
    dict
        Whatever ``ufp.analysis.compute_energy_force_metrics`` returns for the
        split.

    Notes
    -----
    The archive always contains ``indices``, ``sizes``, ``system_sizes`` (the
    same array as ``sizes``), ``true_energies``, ``predicted_energies``,
    ``true_energy_per_atom``, ``predicted_energy_per_atom``,
    ``true_force_components`` and ``predicted_force_components``. The file
    name is printed after the archive has been written.
    """
    from ufp.analysis import compute_energy_force_metrics, predict_atoms

    def flatten_components(per_frame_forces) -> np.ndarray:
        """Concatenate per-frame force arrays into one flat float vector."""
        pieces = [
            np.asarray(force, dtype=float).reshape(-1) for force in per_frame_forces
        ]
        # ``np.concatenate`` raises on an empty list (a split with no frames).
        return np.concatenate(pieces) if pieces else np.zeros(0, dtype=float)

    # Accept plain strings as well as ``Path`` objects; done first so an
    # unusable path is reported before any prediction work starts.
    filename = Path(filename)

    index_array = np.asarray(indices, dtype=int)
    frames = split_frames(dataset, index_array)
    predicted_energies, predicted_forces = predict_atoms(
        model,
        frames,
        batch_size=batch_size,
        device=device,
        dtype=dtype,
        progress=progress,
    )

    true_forces = tuple(dataset.forces[int(index)] for index in index_array)
    true_force_components = flatten_components(true_forces)
    predicted_force_components = flatten_components(predicted_forces)

    true_energies = dataset.energies[index_array]
    sizes = dataset.sizes[index_array]
    true_energy_per_atom = np.asarray(true_energies, dtype=float) / sizes
    predicted_energy_per_atom = np.asarray(predicted_energies, dtype=float) / sizes

    arrays = {
        "indices": index_array,
        "sizes": sizes,
        # Same data under a second key.
        "system_sizes": sizes,
        "true_energies": true_energies,
        "predicted_energies": np.asarray(predicted_energies, dtype=float),
        "true_energy_per_atom": true_energy_per_atom,
        "predicted_energy_per_atom": predicted_energy_per_atom,
        "true_force_components": true_force_components,
        "predicted_force_components": predicted_force_components,
    }
    if uncertainty_posterior is not None:
        arrays.update(
            _uncertainty_arrays_for_split(
                model,
                frames,
                sizes,
                posterior=uncertainty_posterior,
                aleatoric_variance=uncertainty_aleatoric_variance,
                aleatoric_noise_bundle=uncertainty_aleatoric_noise_bundle,
                energy_variance_scale=uncertainty_energy_variance_scale,
                include_forces=bool(uncertainty_forces),
            )
        )

    filename.parent.mkdir(parents=True, exist_ok=True)
    np.savez(filename, **arrays)
    print("Saved predictions:", filename.name)

    return compute_energy_force_metrics(
        true_energies,
        true_forces,
        sizes,
        predicted_energies,
        predicted_forces,
    )
[docs]
def save_prediction_splits(
    model: UFPModel,
    dataset: SupervisedAtomsDataset,
    *,
    output_directory: Path | str,
    prefix: str,
    batch_size: int,
    device: torch.device | str | None = None,
    dtype: torch.dtype | None = None,
    progress: bool = False,
    uncertainty_posterior: object | None = None,
    uncertainty_aleatoric_variance: float | None = None,
    uncertainty_aleatoric_noise_bundle: object | None = None,
    uncertainty_energy_variance_scale: float = 1.0,
    uncertainty_forces: bool = False,
) -> dict[str, dict[str, np.ndarray | float]]:
    """Save train, validation, and holdout prediction arrays.

    Each non-empty split is written by ``save_prediction_split`` to
    ``<output_directory>/<prefix>_<split>_predictions.npz``, where ``<split>``
    is ``training``, ``validation`` or ``holdout``.

    Parameters
    ----------
    model, dataset
        Forwarded to ``save_prediction_split``; ``dataset`` also supplies
        ``training_indices``, ``validation_indices`` and ``holdout_indices``.
    output_directory
        Directory for the archives, as a ``Path`` or a plain string.
    prefix
        Leading part of every archive file name.
    batch_size, device, dtype, progress
        Forwarded unchanged to ``save_prediction_split``.
    uncertainty_posterior, uncertainty_aleatoric_variance,
    uncertainty_aleatoric_noise_bundle, uncertainty_energy_variance_scale,
    uncertainty_forces
        Forwarded unchanged to ``save_prediction_split``.

    Returns
    -------
    dict
        Metrics returned by ``save_prediction_split``, keyed by split name.
        Splits with no indices are skipped and have no entry.
    """
    # Accept plain strings too; ``str / str`` would raise ``TypeError`` below.
    directory = Path(output_directory)

    splits = {
        "training": dataset.training_indices,
        "validation": dataset.validation_indices,
        "holdout": dataset.holdout_indices,
    }
    metrics: dict[str, dict[str, np.ndarray | float]] = {}
    for split, indices in splits.items():
        # ``len`` rather than truthiness: the indices may be NumPy arrays,
        # whose truth value is ambiguous.
        if len(indices) == 0:
            continue
        metrics[split] = save_prediction_split(
            model,
            dataset,
            indices,
            filename=directory / f"{prefix}_{split}_predictions.npz",
            batch_size=batch_size,
            device=device,
            dtype=dtype,
            progress=progress,
            uncertainty_posterior=uncertainty_posterior,
            uncertainty_aleatoric_variance=uncertainty_aleatoric_variance,
            uncertainty_aleatoric_noise_bundle=uncertainty_aleatoric_noise_bundle,
            uncertainty_energy_variance_scale=uncertainty_energy_variance_scale,
            uncertainty_forces=uncertainty_forces,
        )
    return metrics
[docs]
def print_dataset_summary(dataset: SupervisedAtomsDataset) -> None:
    """Print the frame count, the three split sizes, and the atom-count range.

    Four count lines are printed first (all frames, training, validation,
    holdout), followed by one line with the minimum and maximum of
    ``dataset.sizes``.
    """
    count_lines = (
        ("Frames:", "frames"),
        ("Training structures:", "training_indices"),
        ("Validation structures:", "validation_indices"),
        ("Testing/holdout structures:", "holdout_indices"),
    )
    # Look up and print one line at a time so earlier lines are still emitted
    # if a later attribute turns out to be unusable.
    for label, attribute in count_lines:
        print(label, len(getattr(dataset, attribute)))

    atom_counts = dataset.sizes
    smallest = atom_counts.min()
    largest = atom_counts.max()
    print("Atoms per structure: min =", smallest, "max =", largest)
[docs]
def print_metrics(prefix: str, metrics: dict[str, np.ndarray | float]) -> None:
    """Print the energy and force RMSE of one metrics dictionary.

    Reads ``rmse_energy_mev_per_atom`` and ``rmse_force_mev_per_angstrom`` from
    ``metrics`` and prints each on its own line with three decimals, prefixed
    by ``prefix``.
    """
    energy_rmse = metrics["rmse_energy_mev_per_atom"]
    print(f"{prefix} energy RMSE: {energy_rmse:.3f} meV/atom")

    force_rmse = metrics["rmse_force_mev_per_angstrom"]
    print(f"{prefix} force RMSE: {force_rmse:.3f} meV/angstrom")