# Source code for ufp.workflows.threebody

"""Workflow helpers for choosing efficient three-body runtime defaults."""

from __future__ import annotations

from dataclasses import dataclass

import torch

from ufp.terms._threebody_kernels import (
    native_threebody_backend_available,
    native_threebody_dense_feature_cache_available,
    native_threebody_feature_cache_available,
    native_threebody_lstsq_assemble_available,
    native_threebody_preprocess_sources_available,
)
from ufp.terms._threebody_runtime import (
    resolve_threebody_runtime_config,
    set_default_threebody_runtime_env,
    set_threebody_lstsq_runtime_env,
    set_threebody_runtime_env,
)


@dataclass(frozen=True)
class ThreeBodyRuntimeStatus:
    """
    Immutable record of the three-body runtime chosen for a training workflow.

    Attributes:
        requested_backend: Dynamic evaluator backend that was requested.
        requested_bucket_backend: Source-bucketing backend that was requested.
        selected_device: Device selected for training.
        native_cxx_evaluator_available: Whether the native CPU evaluator was
            reported as available.
        native_cuda_evaluator_available: Whether the native CUDA evaluator was
            reported as available.
        native_cxx_bucketing_available: Whether native CPU source
            preprocessing was reported as available.
        native_cxx_dense_cache_available: Whether the native CPU feature
            caches were reported as available.
        native_cxx_bucketing_used: Whether native CPU bucketing is in use.
        native_cxx_dense_cache_used: Whether the native dense cache is in use.
        native_cxx_dynamic_used: Whether the native CPU dynamic kernel is in
            use.
        native_cuda_dynamic_used: Whether the native CUDA dynamic kernel is in
            use.
        dynamic_note: Free-form note about the dynamic evaluator choice.
    """

    requested_backend: str
    requested_bucket_backend: str
    selected_device: torch.device
    native_cxx_evaluator_available: bool
    native_cuda_evaluator_available: bool
    native_cxx_bucketing_available: bool
    native_cxx_dense_cache_available: bool
    native_cxx_bucketing_used: bool
    native_cxx_dense_cache_used: bool
    native_cxx_dynamic_used: bool
    native_cuda_dynamic_used: bool
    dynamic_note: str

    def as_metadata(self) -> dict[str, object]:
        """
        Flatten the status into a JSON/checkpoint-friendly dictionary.

        Returns:
            Mapping of metadata keys to plain values; the device is exported
            as its string form.
        """
        metadata: dict[str, object] = {
            "threebody_backend": self.requested_backend,
            "threebody_bucket_backend": self.requested_bucket_backend,
            "train_device": str(self.selected_device),
        }
        # Every flag below is exported under its own attribute name.
        for flag_name in (
            "native_cxx_evaluator_available",
            "native_cuda_evaluator_available",
            "native_cxx_bucketing_available",
            "native_cxx_dense_cache_available",
            "native_cxx_bucketing_used",
            "native_cxx_dense_cache_used",
            "native_cxx_dynamic_used",
            "native_cuda_dynamic_used",
        ):
            metadata[flag_name] = getattr(self, flag_name)
        metadata["threebody_dynamic_note"] = self.dynamic_note
        return metadata
@dataclass(frozen=True)
class ThreeBodyLeastSquaresRuntimeStatus:
    """
    Immutable record of the three-body runtime chosen for least-squares fits.

    Attributes:
        requested_backend: Least-squares assembly backend that was requested.
        selected_backend: Backend label that was actually selected.
        requested_bucket_backend: Source-bucketing backend that was requested.
        fit_device: Device selected for fitting.
        prediction_device: Device selected for prediction.
        assembled_cache_mode: Assembled-batch cache mode label.
        normal_equation_cache_mode: Normal-equation cache mode label.
        normal_equation_build_device: Device used to build normal equations,
            or ``None`` when none was given.
        native_cxx_assembly_available: Whether native CPU assembly was
            reported as available.
        native_cuda_assembly_available: Whether native CUDA assembly was
            reported as available.
        native_cxx_bucketing_available: Whether native CPU source
            preprocessing was reported as available.
        native_cxx_assembly_used: Whether native CPU assembly is in use.
        native_cuda_assembly_used: Whether native CUDA assembly is in use.
        native_cxx_bucketing_used: Whether native CPU bucketing is in use.
    """

    requested_backend: str
    selected_backend: str
    requested_bucket_backend: str
    fit_device: torch.device
    prediction_device: torch.device
    assembled_cache_mode: str
    normal_equation_cache_mode: str
    normal_equation_build_device: torch.device | None
    native_cxx_assembly_available: bool
    native_cuda_assembly_available: bool
    native_cxx_bucketing_available: bool
    native_cxx_assembly_used: bool
    native_cuda_assembly_used: bool
    native_cxx_bucketing_used: bool

    def as_metadata(self) -> dict[str, object]:
        """
        Flatten the status into a JSON/checkpoint-friendly dictionary.

        Returns:
            Mapping of metadata keys to plain values; devices are exported as
            strings and a missing build device stays ``None``.
        """
        build_device = self.normal_equation_build_device
        build_device_label = None if build_device is None else str(build_device)

        metadata: dict[str, object] = {
            "threebody_lstsq_backend": self.requested_backend,
            "threebody_lstsq_selected_backend": self.selected_backend,
            "threebody_bucket_backend": self.requested_bucket_backend,
            "fit_device": str(self.fit_device),
            "prediction_device": str(self.prediction_device),
            "lstsq_cache_mode": self.assembled_cache_mode,
            "normal_equation_cache_mode": self.normal_equation_cache_mode,
            "normal_equation_build_device": build_device_label,
        }
        # (metadata key, attribute name): the exported keys say "lstsq" where
        # the attributes say "assembly".
        exported_flags = (
            ("native_cxx_lstsq_available", "native_cxx_assembly_available"),
            ("native_cuda_lstsq_available", "native_cuda_assembly_available"),
            ("native_cxx_bucketing_available", "native_cxx_bucketing_available"),
            ("native_cxx_lstsq_used", "native_cxx_assembly_used"),
            ("native_cuda_lstsq_used", "native_cuda_assembly_used"),
            ("native_cxx_bucketing_used", "native_cxx_bucketing_used"),
        )
        for key, attribute in exported_flags:
            metadata[key] = getattr(self, attribute)
        return metadata
def configure_threebody_runtime_defaults() -> None:
    """
    Set conservative three-body runtime defaults unless already configured.

    This is a thin wrapper that delegates entirely to
    ``set_default_threebody_runtime_env``.

    ``auto`` prefers native dynamic kernels when available. Bucket ``auto``
    uses the Python/Torch path for CUDA inputs to avoid CPU-GPU transfer
    overhead, and may use native CPU source preprocessing for CPU inputs.
    Explicit ``UFP_THREEBODY_BACKEND`` and ``UFP_THREEBODY_BUCKET_BACKEND``
    environment settings are preserved.
    """
    set_default_threebody_runtime_env()
def preferred_threebody_inference_device() -> torch.device:
    """
    Pick the device that dynamic three-body inference should run on.

    Runtime defaults are applied first. Candidates are then tried in this
    order: native CUDA kernels, native CPU C++ kernels, CUDA through PyTorch,
    and finally CPU through PyTorch.

    Returns:
        Preferred torch device for dynamic three-body inference.
    """
    configure_threebody_runtime_defaults()

    has_cuda = torch.cuda.is_available()
    # The native CUDA kernel is only probed when a CUDA device exists.
    if has_cuda and native_threebody_backend_available(device="cuda"):
        return torch.device("cuda")
    if native_threebody_backend_available(device="cpu"):
        return torch.device("cpu")
    # No native kernel at all: fall back to plain PyTorch.
    return torch.device("cuda" if has_cuda else "cpu")
def _resolve_device_option(
    *,
    selection: str | torch.device,
    explicit: str | torch.device | None,
) -> torch.device:
    """
    Turn a device request into a concrete ``torch.device``.

    Args:
        selection: Device selection; ``"auto"`` defers to
            ``preferred_threebody_inference_device`` and ``"gpu"`` is treated
            as an alias for ``"cuda"``. Matching ignores case and surrounding
            whitespace.
        explicit: Explicit device; when not ``None`` it wins over
            ``selection``.

    Returns:
        The resolved torch device.
    """
    if explicit is not None:
        return torch.device(explicit)

    choice = str(selection).strip().lower()
    if choice == "auto":
        return preferred_threebody_inference_device()
    # Only the bare "gpu" spelling is aliased; anything else goes to torch as-is.
    return torch.device("cuda" if choice == "gpu" else choice)
def configure_threebody_leastsquares_runtime(
    *,
    backend: str = "auto",
    bucket_backend: str = "python",
    fit_device_selection: str | torch.device = "auto",
    fit_device: str | torch.device | None = None,
    prediction_device_selection: str | torch.device = "auto",
    prediction_device: str | torch.device | None = None,
    assembled_cache_mode: str = "auto",
    normal_equation_cache_mode: str = "auto",
    normal_equation_build_device: str | torch.device | None = None,
) -> tuple[torch.device, torch.device, ThreeBodyLeastSquaresRuntimeStatus]:
    """
    Apply three-body least-squares runtime options and report dispatch status.

    The requested backends are normalized through
    ``resolve_threebody_runtime_config`` and written to the environment with
    ``set_threebody_lstsq_runtime_env`` before the remaining defaults are
    applied.

    When the fit device is left on ``auto`` it follows the order native CUDA
    assembly, native CPU C++ assembly, CUDA Torch, then CPU Torch. Native
    source bucketing is not part of the default least-squares path because
    the benchmarked workloads do not justify its overhead.

    The reported ``selected_backend`` is one of ``"torch"``, ``"cuda"``,
    ``"native"`` or ``"unavailable"``.

    Args:
        backend: Requested least-squares assembly backend.
        bucket_backend: Requested source-bucketing backend.
        fit_device_selection: Automatic or explicit fit-device selection.
        fit_device: Explicit fit device override.
        prediction_device_selection: Automatic or explicit prediction-device
            selection.
        prediction_device: Explicit prediction device override.
        assembled_cache_mode: Assembled-batch cache mode label; only recorded
            in the status.
        normal_equation_cache_mode: Normal-equation cache mode label; only
            recorded in the status.
        normal_equation_build_device: Optional device used to build normal
            equations; when omitted the fit device is used for backend
            selection.

    Returns:
        Fit device, prediction device, and selected runtime status.
    """
    resolved = resolve_threebody_runtime_config(
        lstsq_backend=backend,
        bucket_backend=bucket_backend,
    )
    requested = resolved.lstsq_backend
    bucket_choice = resolved.bucket_backend
    set_threebody_lstsq_runtime_env(
        backend=requested,
        bucket_backend=bucket_choice,
    )
    configure_threebody_runtime_defaults()

    # Probe which native building blocks can be used.
    cxx_assembly_ok = native_threebody_feature_cache_available(
        device="cpu"
    ) and native_threebody_dense_feature_cache_available(device="cpu")
    cuda_assembly_ok = (
        torch.cuda.is_available()
        and native_threebody_lstsq_assemble_available(device="cuda")
    )
    cxx_bucketing_ok = native_threebody_preprocess_sources_available(device="cpu")

    # Fit device: anything other than a pure "auto" request goes through the
    # shared resolver; "auto" prefers whichever assembly path is available.
    auto_fit = (
        fit_device is None
        and str(fit_device_selection).strip().lower() == "auto"
    )
    if not auto_fit:
        fit_target = _resolve_device_option(
            selection=fit_device_selection,
            explicit=fit_device,
        )
    elif cuda_assembly_ok:
        fit_target = torch.device("cuda")
    elif cxx_assembly_ok:
        fit_target = torch.device("cpu")
    else:
        fit_target = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    prediction_target = _resolve_device_option(
        selection=prediction_device_selection,
        explicit=prediction_device,
    )

    normal_target = None
    if normal_equation_build_device is not None:
        normal_target = torch.device(normal_equation_build_device)
    # Backend selection is driven by the device the normal equations are
    # built on, which defaults to the fit device.
    build_device = fit_target if normal_target is None else normal_target

    if requested == "torch":
        chosen_backend = "torch"
    elif requested == "cuda":
        chosen_backend = "cuda" if cuda_assembly_ok else "unavailable"
    elif cuda_assembly_ok and build_device.type == "cuda":
        chosen_backend = "cuda"
    elif cxx_assembly_ok:
        chosen_backend = "native"
    elif requested == "auto":
        chosen_backend = "torch"
    else:
        chosen_backend = "unavailable"

    # Native bucketing is wanted when requested explicitly, or under "auto"
    # when the build device is the CPU; it is used only if also available.
    wants_native_bucketing = bucket_choice == "native" or (
        bucket_choice == "auto" and build_device.type == "cpu"
    )

    status = ThreeBodyLeastSquaresRuntimeStatus(
        requested_backend=requested,
        selected_backend=chosen_backend,
        requested_bucket_backend=bucket_choice,
        fit_device=fit_target,
        prediction_device=prediction_target,
        assembled_cache_mode=str(assembled_cache_mode),
        normal_equation_cache_mode=str(normal_equation_cache_mode),
        normal_equation_build_device=normal_target,
        native_cxx_assembly_available=cxx_assembly_ok,
        native_cuda_assembly_available=cuda_assembly_ok,
        native_cxx_bucketing_available=cxx_bucketing_ok,
        native_cxx_assembly_used=chosen_backend == "native",
        native_cuda_assembly_used=chosen_backend == "cuda",
        native_cxx_bucketing_used=wants_native_bucketing and cxx_bucketing_ok,
    )
    return fit_target, prediction_target, status
def configure_threebody_training_runtime(
    *,
    backend: str = "auto",
    bucket_backend: str = "auto",
    device_selection: str | torch.device = "auto",
    device: str | torch.device | None = None,
    cache_batches: bool = False,
    cache_batches_on_device: bool = False,
    feature_cache_storage: str = "none",
) -> tuple[torch.device, ThreeBodyRuntimeStatus]:
    """
    Apply three-body runtime options and return the selected training status.

    Native dynamic three-body kernels are inference-only today, so the
    evaluator backend written to the environment is always ``"torch"``
    regardless of ``backend``; the request is only recorded in the status.
    The native CPU bucket path can still be used when its inputs remain on
    CPU.

    Args:
        backend: Requested dynamic three-body evaluator backend.
        bucket_backend: Requested source-bucketing backend.
        device_selection: Automatic or explicit training-device selection;
            ``"gpu"`` is accepted as an alias for ``"cuda"``.
        device: Explicit training device override.
        cache_batches: Whether training will reuse cached batches. Accepted
            but not currently consulted by this function.
        cache_batches_on_device: Whether cached batches stay on the training
            device. Accepted but not currently consulted by this function.
        feature_cache_storage: Feature-cache storage policy. Accepted but not
            currently consulted by this function.

    Returns:
        Training device and selected runtime status.
    """
    resolved = resolve_threebody_runtime_config(
        dynamic_backend=backend,
        bucket_backend=bucket_backend,
    )
    requested = resolved.dynamic_backend
    bucket_choice = resolved.bucket_backend
    # Training needs autograd, so the evaluator is pinned to torch.
    set_threebody_runtime_env(
        backend="torch",
        bucket_backend=bucket_choice,
    )
    configure_threebody_runtime_defaults()

    if device is not None:
        target = torch.device(device)
    else:
        choice = str(device_selection).strip().lower()
        if choice == "auto":
            # NOTE(review): "auto" reuses the inference preference order,
            # which ranks native CPU kernels above CUDA PyTorch, although the
            # evaluator is pinned to torch here. Confirm this is intended.
            target = preferred_threebody_inference_device()
        else:
            target = torch.device("cuda" if choice == "gpu" else choice)

    # Availability probes are reported for information only; none of them
    # changes the evaluator used for training.
    cxx_evaluator_ok = native_threebody_backend_available(device="cpu")
    cuda_evaluator_ok = (
        torch.cuda.is_available()
        and native_threebody_backend_available(device="cuda")
    )
    cxx_bucketing_ok = native_threebody_preprocess_sources_available(device="cpu")
    cxx_dense_cache_ok = native_threebody_feature_cache_available(
        device="cpu"
    ) and native_threebody_dense_feature_cache_available(device="cpu")

    # Native bucketing is wanted when requested explicitly, or under "auto"
    # when training runs on the CPU; it is used only if also available.
    wants_native_bucketing = bucket_choice == "native" or (
        bucket_choice == "auto" and target.type == "cpu"
    )

    if requested == "torch":
        note = "UFP_THREEBODY_BACKEND=torch"
    else:
        note = "training requires autograd; native dynamic kernels are inference-only"

    status = ThreeBodyRuntimeStatus(
        requested_backend=requested,
        requested_bucket_backend=bucket_choice,
        selected_device=target,
        native_cxx_evaluator_available=cxx_evaluator_ok,
        native_cuda_evaluator_available=cuda_evaluator_ok,
        native_cxx_bucketing_available=cxx_bucketing_ok,
        native_cxx_dense_cache_available=cxx_dense_cache_ok,
        native_cxx_bucketing_used=wants_native_bucketing and cxx_bucketing_ok,
        # The dense cache and native dynamic kernels are never reported as
        # used on the training path.
        native_cxx_dense_cache_used=False,
        native_cxx_dynamic_used=False,
        native_cuda_dynamic_used=False,
        dynamic_note=note,
    )
    return target, status
# Public API of this module.
# NOTE(review): `print_threebody_leastsquares_runtime_status` and
# `print_threebody_runtime_status` are neither defined nor imported in the
# portion of the module visible here. Confirm they exist elsewhere in the
# file; otherwise a star-import of this module raises AttributeError.
__all__ = [
    "ThreeBodyLeastSquaresRuntimeStatus",
    "ThreeBodyRuntimeStatus",
    "configure_threebody_leastsquares_runtime",
    "configure_threebody_training_runtime",
    "configure_threebody_runtime_defaults",
    "preferred_threebody_inference_device",
    "print_threebody_leastsquares_runtime_status",
    "print_threebody_runtime_status",
]