"""
integrals ā Molecular integral container and PySCF computation
==============================================================
Provides the ``MolecularIntegrals`` frozen dataclass that holds one- and
two-electron integrals together with molecule metadata, and the
``compute_molecular_integrals`` helper that runs RHF via PySCF and
returns a populated ``MolecularIntegrals`` instance.
"""
from __future__ import annotations
import logging
import os
import shutil
from collections.abc import Callable
from dataclasses import dataclass
import numpy as np
logger = logging.getLogger(__name__)
__all__ = [
"MATRIX_ELEMENT_TOL",
"MolecularIntegrals",
"cached_compute_molecular_integrals",
"clear_integral_cache",
"compute_molecular_integrals",
"get_integral_cache",
]
MATRIX_ELEMENT_TOL: float = 1e-12
"""float : Absolute tolerance below which matrix elements are treated as zero."""
# ---------------------------------------------------------------------------
# MolecularIntegrals dataclass
# ---------------------------------------------------------------------------
[docs]
@dataclass(frozen=True)
class MolecularIntegrals:
"""Container for molecular one- and two-electron integrals.
All arrays use the *spatial-orbital* indexing convention produced by
PySCF's ``ao2mo`` module.
Parameters
----------
h1e : np.ndarray
One-electron integrals, shape ``(n_orb, n_orb)``, dtype ``float64``.
h2e : np.ndarray
Two-electron integrals in chemist's notation ``(pq|rs)``,
shape ``(n_orb, n_orb, n_orb, n_orb)``, dtype ``float64``.
nuclear_repulsion : float
Nuclear repulsion energy in Hartree.
n_electrons : int
Total number of electrons.
n_orbitals : int
Number of spatial orbitals.
n_alpha : int
Number of alpha (spin-up) electrons.
n_beta : int
Number of beta (spin-down) electrons.
Raises
------
ValueError
If array shapes are inconsistent with ``n_orbitals`` or if
``n_alpha + n_beta != n_electrons``.
Examples
--------
>>> import numpy as np
>>> h1 = np.zeros((2, 2))
>>> h2 = np.zeros((2, 2, 2, 2))
>>> mi = MolecularIntegrals(h1, h2, 0.7, 2, 2, 1, 1)
>>> mi.n_orbitals
2
"""
h1e: np.ndarray
h2e: np.ndarray
nuclear_repulsion: float
n_electrons: int
n_orbitals: int
n_alpha: int
n_beta: int
def __post_init__(self) -> None:
"""Validate shapes and dtypes."""
n = self.n_orbitals
if self.h1e.shape != (n, n):
raise ValueError(
f"h1e shape {self.h1e.shape} does not match n_orbitals={n}"
)
if self.h2e.shape != (n, n, n, n):
raise ValueError(
f"h2e shape {self.h2e.shape} does not match n_orbitals={n}"
)
if self.n_alpha + self.n_beta != self.n_electrons:
raise ValueError(
f"n_alpha ({self.n_alpha}) + n_beta ({self.n_beta}) "
f"!= n_electrons ({self.n_electrons})"
)
# ---------------------------------------------------------------------------
# compute_molecular_integrals (PySCF)
# ---------------------------------------------------------------------------
_FCI_CONFIG_LIMIT: int = 50_000_000
"""int : Auto-CASCI threshold ā skip CASSCF orbital optimisation when the
active-space determinant count exceeds this limit."""
[docs]
def compute_molecular_integrals(
geometry: list[tuple[str, tuple[float, float, float]]],
basis: str = "sto-3g",
charge: int = 0,
spin: int = 0,
cas: tuple[int, int] | None = None,
casci: bool = False,
) -> MolecularIntegrals:
"""Run RHF (+ optional CASSCF/CASCI) and extract molecular integrals.
Parameters
----------
geometry : list of (str, (float, float, float))
Molecular geometry. Each element is ``(atom_symbol, (x, y, z))``
with coordinates in **Angstroms**.
basis : str, optional
Gaussian basis set name (default ``"sto-3g"``).
charge : int, optional
Net charge of the molecule (default ``0``).
spin : int, optional
Spin multiplicity minus one, i.e. ``2S`` (default ``0`` for
singlet).
cas : tuple of (int, int) or None, optional
``(nelecas, ncas)`` for CAS active-space reduction. When
provided, runs CASSCF (or CASCI if *casci* is ``True``) after
RHF and returns integrals in the active space only
(``n_orbitals = ncas``). ``None`` (default) uses the full MO
space.
casci : bool, optional
If ``True`` and *cas* is not ``None``, use CASCI instead of
CASSCF. CASCI uses HF MOs directly (no orbital optimisation),
which is faster for large active spaces where CASSCF's
iterative FCI solver would be infeasible. Also auto-enabled
when ``ncas >= 15`` or when the determinant count exceeds
``_FCI_CONFIG_LIMIT``.
Returns
-------
MolecularIntegrals
Integrals and metadata needed by :class:`MolecularHamiltonian`.
When *cas* is used, ``nuclear_repulsion`` is the frozen-core
energy ``e_core`` (includes frozen-electron energy + nuclear
repulsion), and ``n_orbitals`` equals ``ncas``.
Raises
------
ImportError
If PySCF is not installed.
Warns
-----
UserWarning
If the RHF or CASSCF calculation does not converge (proceeds
with unconverged orbitals).
Examples
--------
>>> geometry = [("H", (0.0, 0.0, 0.0)), ("H", (0.0, 0.0, 0.74))]
>>> mi = compute_molecular_integrals(geometry, basis="sto-3g") # doctest: +SKIP
>>> mi.n_orbitals # doctest: +SKIP
2
CAS example (Nā with 10 active electrons in 8 orbitals):
>>> n2 = [("N", (0, 0, 0)), ("N", (0, 0, 1.1))]
>>> mi = compute_molecular_integrals(n2, "cc-pvdz", cas=(10, 8)) # doctest: +SKIP
>>> mi.n_orbitals # doctest: +SKIP
8
"""
try:
import pyscf # noqa: F401
from pyscf import ao2mo, gto, scf
except ImportError as exc:
raise ImportError(
"PySCF is required for compute_molecular_integrals. "
"Install it with: pip install pyscf"
) from exc
# Build molecule
mol = gto.Mole()
mol.atom = [(atom, coord) for atom, coord in geometry]
mol.basis = basis
mol.charge = charge
mol.spin = spin
mol.unit = "Angstrom"
mol.build()
# Run RHF
mf = scf.RHF(mol)
mf.kernel()
if not mf.converged:
import warnings
warnings.warn(
"RHF did not converge. Proceeding with unconverged orbitals.",
stacklevel=2,
)
# --- CAS active-space path ---
if cas is not None:
return _compute_cas_integrals(mol, mf, cas, casci, spin)
# --- Full-space path (original behaviour) ---
n_orb = mf.mo_coeff.shape[1]
h1e = mf.mo_coeff.T @ mf.get_hcore() @ mf.mo_coeff
h1e = np.asarray(h1e, dtype=np.float64)
eri_mo = ao2mo.full(mol, mf.mo_coeff)
h2e = ao2mo.restore(1, eri_mo, n_orb).astype(np.float64)
n_electrons = mol.nelectron
n_alpha = (n_electrons + spin) // 2
n_beta = (n_electrons - spin) // 2
return MolecularIntegrals(
h1e=h1e,
h2e=h2e,
nuclear_repulsion=float(mol.energy_nuc()),
n_electrons=n_electrons,
n_orbitals=n_orb,
n_alpha=n_alpha,
n_beta=n_beta,
)
def _compute_cas_integrals(
mol: object,
mf: object,
cas: tuple[int, int],
casci: bool,
spin: int,
) -> MolecularIntegrals:
"""Extract active-space integrals via CASSCF or CASCI.
Parameters
----------
mol : pyscf.gto.Mole
Built PySCF molecule.
mf : pyscf.scf.RHF
Converged (or attempted) RHF object.
cas : tuple of (int, int)
``(nelecas, ncas)`` active space specification.
casci : bool
Force CASCI (no orbital optimisation).
spin : int
``2S`` spin value.
Returns
-------
MolecularIntegrals
Active-space integrals with ``nuclear_repulsion = e_core``.
Raises
------
ValueError
If active electron counts are negative or exceed ``ncas``.
"""
from math import comb as _comb
from pyscf import ao2mo, fci, mcscf
nelecas, ncas = cas
n_elec_cas = nelecas
n_alpha_cas = (nelecas + spin) // 2
n_beta_cas = (nelecas - spin) // 2
# Validate active electron / orbital counts
if n_alpha_cas < 0 or n_beta_cas < 0:
raise ValueError(
f"Invalid CAS electron counts: n_alpha={n_alpha_cas}, "
f"n_beta={n_beta_cas} (from nelecas={nelecas}, spin={spin})"
)
if n_alpha_cas > ncas or n_beta_cas > ncas:
raise ValueError(
f"CAS electrons exceed orbitals: n_alpha={n_alpha_cas}, "
f"n_beta={n_beta_cas}, ncas={ncas}"
)
# Estimate determinant count for auto-CASCI decision
n_configs = _comb(ncas, n_alpha_cas) * _comb(ncas, n_beta_cas)
use_casci = casci or ncas >= 15 or n_configs > _FCI_CONFIG_LIMIT
if use_casci:
mc = mcscf.CASCI(mf, ncas=ncas, nelecas=nelecas)
else:
mc = mcscf.CASSCF(mf, ncas=ncas, nelecas=nelecas)
# Linear molecules need non-symmetry FCI solver
if hasattr(mol, "symmetry") and mol.symmetry:
topgroup = getattr(mol, "topgroup", "")
if topgroup in ("Dooh", "Coov"):
mc.fcisolver = fci.direct_spin1.FCISolver(mol)
# Skip kernel for infeasibly large CAS (integrals-only mode)
if use_casci and n_configs > _FCI_CONFIG_LIMIT:
logger.info(
"Skipping FCI solve for CAS(%s,%d): %s configs > %s limit. "
"Extracting integrals only.",
nelecas,
ncas,
f"{n_configs:,}",
f"{_FCI_CONFIG_LIMIT:,}",
)
else:
mc.kernel()
if not use_casci and not mc.converged:
import warnings
warnings.warn(
f"CASSCF did not converge for CAS({nelecas},{ncas}). "
"Integrals may be unreliable.",
stacklevel=3,
)
# Extract active-space integrals
h1e_cas, e_core = mc.h1e_for_cas()
active_mo = mc.mo_coeff[:, mc.ncore : mc.ncore + mc.ncas]
h2e_cas = ao2mo.full(mol, active_mo)
h2e_cas = ao2mo.restore(1, h2e_cas, ncas)
h1e_cas = np.asarray(h1e_cas, dtype=np.float64)
h2e_cas = np.asarray(h2e_cas, dtype=np.float64)
return MolecularIntegrals(
h1e=h1e_cas,
h2e=h2e_cas,
nuclear_repulsion=float(e_core),
n_electrons=n_elec_cas,
n_orbitals=ncas,
n_alpha=n_alpha_cas,
n_beta=n_beta_cas,
)
# ---------------------------------------------------------------------------
# Persistent cache via joblib
# ---------------------------------------------------------------------------
_DEFAULT_CACHE_DIR = os.path.join(
os.environ.get("QVARTOOLS_CACHE_DIR", os.path.expanduser("~/.cache/qvartools")),
"integrals",
)
def get_integral_cache(
cache_dir: str | None = None,
) -> Callable[..., MolecularIntegrals]:
"""Return a cached version of :func:`compute_molecular_integrals`.
Uses ``joblib.Memory`` for transparent disk-based caching of PySCF
integral computations. Repeated calls with the same arguments
return instantly from disk.
Parameters
----------
cache_dir : str or None, optional
Directory for cached results. Defaults to
``~/.cache/qvartools/integrals`` (overridable via
``QVARTOOLS_CACHE_DIR`` environment variable).
Returns
-------
callable
A cached version of ``compute_molecular_integrals`` with the
same signature.
"""
try:
from joblib import Memory
except ImportError as exc:
raise ImportError(
"joblib is required for integral caching. "
"Install it with: pip install joblib"
) from exc
location = cache_dir if cache_dir is not None else _DEFAULT_CACHE_DIR
memory = Memory(location, verbose=0)
_joblib_cached = memory.cache(compute_molecular_integrals)
logger.info("Integral cache enabled at %s", location)
def _cas_aware_cached(
geometry: list[tuple[str, tuple[float, float, float]]],
basis: str = "sto-3g",
charge: int = 0,
spin: int = 0,
cas: tuple[int, int] | None = None,
casci: bool = False,
) -> MolecularIntegrals:
if cas is not None:
return compute_molecular_integrals(
geometry, basis=basis, charge=charge, spin=spin, cas=cas, casci=casci
)
return _joblib_cached(geometry, basis=basis, charge=charge, spin=spin)
return _cas_aware_cached
# Module-level default cached function (lazy init)
_default_cached_fn: Callable[..., MolecularIntegrals] | None = None
def cached_compute_molecular_integrals(
geometry: list[tuple[str, tuple[float, float, float]]],
basis: str = "sto-3g",
charge: int = 0,
spin: int = 0,
cas: tuple[int, int] | None = None,
casci: bool = False,
) -> MolecularIntegrals:
"""Cached version of :func:`compute_molecular_integrals`.
Identical interface, but results are persisted to disk via
``joblib.Memory``. The default cache directory is
``~/.cache/qvartools/integrals``.
CAS integrals are **not cached** because CASSCF orbital
optimisation is non-deterministic.
Parameters
----------
geometry : list of (str, (float, float, float))
Molecular geometry.
basis : str, optional
Basis set name (default ``"sto-3g"``).
charge : int, optional
Net charge (default ``0``).
spin : int, optional
2S (default ``0``).
cas : tuple of (int, int) or None, optional
``(nelecas, ncas)`` for CAS. Bypasses cache when not ``None``.
casci : bool, optional
Force CASCI (default ``False``).
Returns
-------
MolecularIntegrals
Cached or freshly computed integrals.
"""
# CAS integrals bypass cache (CASSCF is non-deterministic)
if cas is not None:
return compute_molecular_integrals(
geometry, basis=basis, charge=charge, spin=spin, cas=cas, casci=casci
)
global _default_cached_fn # noqa: PLW0603
if _default_cached_fn is None:
_default_cached_fn = get_integral_cache()
return _default_cached_fn(geometry, basis=basis, charge=charge, spin=spin)
def clear_integral_cache(cache_dir: str | None = None) -> None:
"""Remove all cached integral data.
Parameters
----------
cache_dir : str or None, optional
Cache directory to clear. Defaults to the same directory
used by :func:`get_integral_cache`.
"""
location = cache_dir if cache_dir is not None else _DEFAULT_CACHE_DIR
# Safety: refuse to delete directories that don't look like a cache
if "qvartools" not in location and "cache" not in location.lower():
raise ValueError(
f"Refusing to delete '{location}' ā path does not contain "
f"'qvartools' or 'cache'. Pass an explicit cache directory."
)
if os.path.isdir(location):
shutil.rmtree(location)
logger.info("Integral cache cleared at %s", location)