Skip to content

Commit

Permalink
parallelize some stuff
Browse files Browse the repository at this point in the history
  • Loading branch information
tjira committed Sep 8, 2024
1 parent d858638 commit f702ff0
Show file tree
Hide file tree
Showing 12 changed files with 74 additions and 91 deletions.
21 changes: 0 additions & 21 deletions education/python/resmet.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@
parser.add_argument("--ccd", help="Perform the doubles coupled clusters calculation.", action=ap.BooleanOptionalAction)
parser.add_argument("--ccsd", help="Perform the singles/doubles coupled clusters calculation.", action=ap.BooleanOptionalAction)
parser.add_argument("--ccsd-t", help="Perform the singles/doubles coupled clusters calculation with third order excitations included using perturbation theory.", action=ap.BooleanOptionalAction)
parser.add_argument("--lccd", help="Perform the linearized doubles coupled clusters calculation.", action=ap.BooleanOptionalAction)
parser.add_argument("--mp2", help="Perform the second order Moller-Plesset calculation.", action=ap.BooleanOptionalAction)
parser.add_argument("--mp3", help="Perform the third order Moller-Plesset calculation.", action=ap.BooleanOptionalAction)

Expand Down Expand Up @@ -155,26 +154,6 @@
# initialize the first guess for the t-amplitudes
t1, t2 = np.zeros((2 * nvirt, 2 * nocc)), Jmsa[v, v, o, o] * Emsd

# LCCD loop
if args.lccd:
while abs(E_LCCD - E_LCCD_P) > args.threshold:
# collect all the distinct terms
lccd1 = 0.5 * np.einsum("abcd,cdij->abij", Jmsa[v, v, v, v], t2, optimize=True)
lccd2 = 0.5 * np.einsum("klij,abkl->abij", Jmsa[o, o, o, o], t2, optimize=True)
lccd3 = np.einsum("akic,bcjk->abij", Jmsa[v, o, o, v], t2, optimize=True)

# apply the permuation operator and add it to the corresponding term
lccd3 = lccd3 + lccd3.transpose(1, 0, 3, 2) - lccd3.transpose(1, 0, 2, 3) - lccd3.transpose(0, 1, 3, 2)

# update the t-amplitudes
t2 = Emsd * (Jmsa[v, v, o, o] + lccd1 + lccd2 + lccd3)

# evaluate the energy
E_LCCD_P, E_LCCD = E_LCCD, (1 / 4) * np.einsum("ijab,abij", Jmsa[o, o, v, v], t2, optimize=True)

# print the LCCD energy
print(" LCCD ENERGY: {:.8f}".format(E_HF + E_LCCD + VNN))

# CCD loop
if args.ccd:
while abs(E_CCD - E_CCD_P) > args.threshold:
Expand Down
2 changes: 1 addition & 1 deletion include/hartreefock.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ class HartreeFock {
double get_energy(const torch::Tensor& H_AO, const torch::Tensor& F_AO, const torch::Tensor& D_MO) const;
torch::Tensor get_fock(const torch::Tensor& H_AO, const torch::Tensor& J_AO, const torch::Tensor& D_MO) const;
std::tuple<torch::Tensor, torch::Tensor, double> run(const System& system, const torch::Tensor& H_AO, const torch::Tensor& S_AO, const torch::Tensor& J_AO, torch::Tensor D_MO) const;
std::tuple<torch::Tensor, torch::Tensor, double> scf(const System& system, const torch::Tensor& H_AO, const torch::Tensor& S_AO, const torch::Tensor& J_AO, torch::Tensor D_MO) const;
std::tuple<torch::Tensor, torch::Tensor, double> scf(const System& system, const torch::Tensor& H, const torch::Tensor& S, const torch::Tensor& J, torch::Tensor D) const;

private:
Input::HartreeFock input;
Expand Down
2 changes: 1 addition & 1 deletion include/input.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ struct Input {
} classical_dynamics;
};

extern nlohmann::json default_input;
extern nlohmann::json default_input; extern int nthread;

NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Input::System, basis, path, charge, multiplicity);
NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Input::Wavefunction, dimension, mass, momentum, grid_limits, grid_points, guess);
Expand Down
2 changes: 1 addition & 1 deletion include/integral.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

class Integral {
public:
Integral(const Input::Integral& input);
Integral(const Input::Integral& input) : precision(input.precision) {}

static torch::Tensor double_electron(libint2::Engine& engine, const libint2::BasisSet& shells);
static torch::Tensor single_electron(libint2::Engine& engine, const libint2::BasisSet& shells);
Expand Down
7 changes: 2 additions & 5 deletions script/libfftw.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ mkdir -p external && wget -O external/libfftw.tar.gz https://www.fftw.org/fftw-3
cd external && rm -rf libfftw && tar -xzvf libfftw.tar.gz && mv fftw* libfftw && cd -

# configure FFTW
cd external/libfftw && cmake -B build -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX="$PWD/install" && cd -
cd external/libfftw && cmake -B build -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DCMAKE_INSTALL_PREFIX="$PWD/install" && cd -

# compile and install FFTW
cd external/libfftw && cmake --build build --parallel 2 && cmake --install build && cd -
Expand All @@ -18,8 +18,5 @@ cp -r external/libfftw/install/include external/libfftw/install/lib external/
# remove redundant files
rm -rf external/libfftw external/include/*.f external/include/*.f03 external/lib/cmake external/lib/pkgconfig

# rename the shared library
cd external/lib && [[ -f libfftw3.so.3 ]] && mv libfftw3.so.3 libfftw.so && patchelf --set-soname libfftw.so libfftw.so && rm libfftw3.* ; cd -

# rename the static library
cd external/lib && [[ -f libfftw3.a ]] && mv libfftw3.a libfftw.a ; cd -
cd external/lib && mv libfftw3.a libfftw.a ; cd -
19 changes: 5 additions & 14 deletions script/libint.sh
Original file line number Diff line number Diff line change
@@ -1,25 +1,16 @@
#!/bin/bash

# download libint
mkdir -p external && wget -O external/libint.tgz https://github.com/evaleev/libint/releases/download/v2.9.0/libint-2.9.0-mpqc4.tgz
mkdir -p external && git clone https://github.com/evaleev/libint.git external/libint

# unpack libint
cd external && rm -rf libint && tar -xvf libint.tgz --warning=no-unknown-keyword && mv libint-* libint && cd -

# configure libint
cd external/libint && cmake -B build -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX="$PWD/install" && cd -

# compile and install libint
cd external/libint && cmake --build build --parallel 2 && cmake --install build && cd -
# configure, compile and install libint
cd external/libint && ./autogen.sh && ./configure CXX=g++ CXXFLAGS="-march=native -s -O3 -flto=auto" --prefix="$PWD/install" --enable-1body=1 --enable-eri=1 --with-max-am=6 && make -j2 && make install && cd -

# copy the compiled library
cp -r external/libint/install/include external/libint/install/lib external/

# remove redundant files
rm -rf external/libint external/lib/cmake external/lib/pkgconfig

# rename the shared library
cd external/lib && [[ -f libint2.so ]] && mv libint2.so.2.9.0 libint.so && patchelf --set-soname libint.so libint.so && rm libint2.* ; cd -
rm -rf external/libint external/lib/*.la external/lib/pkgconfig

# rename the library
cd external/lib && [[ -f libint2.a ]] && mv libint2.a libint.a ; cd -
cd external/lib && mv libint2.a libint.a ; cd -
1 change: 1 addition & 0 deletions src/configurationinteraction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ std::tuple<torch::Tensor, torch::Tensor> ConfigurationInteraction::run(const Sys
Timepoint hamiltonian_timer = Timer::Now();

// fill the CI Hamiltonian
#pragma omp parallel for num_threads(nthread)
for (size_t i = 0; i < dets.size(); i++) {
for (size_t j = i; j < dets.size(); j++) {

Expand Down
22 changes: 11 additions & 11 deletions src/hartreefock.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,12 @@ std::tuple<torch::Tensor, torch::Tensor, double> HartreeFock::run(const System&
return input.generalized ? scf(system, H_AS, S_AS, J_AS, D_MS) : scf(system, H_AO, S_AO, J_AO, D_MO);
}

std::tuple<torch::Tensor, torch::Tensor, double> HartreeFock::scf(const System& system, const torch::Tensor& H_AO, const torch::Tensor& S_AO, const torch::Tensor& J_AO, torch::Tensor D_MO) const {
std::tuple<torch::Tensor, torch::Tensor, double> HartreeFock::scf(const System& system, const torch::Tensor& H, const torch::Tensor& S, const torch::Tensor& J, torch::Tensor D) const {
// define the Fock matrix, orbital coefficients, electron energy and containers for errors and Fock matrices
torch::Tensor F_AO = H_AO, C_MO = torch::zeros_like(D_MO); double energy_hf = 0; std::vector<torch::Tensor> errors, focks;
torch::Tensor F = H, C = torch::zeros_like(D); double energy_hf = 0; std::vector<torch::Tensor> errors, focks;

// calculate the complementary X matrix as square root of the inverse of the overlap matrix
auto [SVAL, SVEC] = torch::linalg::eigh(S_AO, "L"); torch::Tensor X = SVEC.mm(torch::diag(1 / torch::sqrt(SVAL))).mm(SVEC.swapaxes(0, 1));
auto [SVAL, SVEC] = torch::linalg::eigh(S, "L"); torch::Tensor X = SVEC.mm(torch::diag(1 / torch::sqrt(SVAL))).mm(SVEC.swapaxes(0, 1));

// print the header
std::printf("%s HF ENERGY CALCULATION\n%6s %20s %8s %8s %12s\n", input.generalized ? "GENERALIZED" : "RESTRICTED", "ITER", "ENERGY", "|dE|", "|dD|", "TIMER");
Expand All @@ -54,10 +54,10 @@ std::tuple<torch::Tensor, torch::Tensor, double> HartreeFock::scf(const System&
Timepoint iteration_timer = Timer::Now();

// calculate the Fock matrix and save the previous values
F_AO = get_fock(H_AO, J_AO, D_MO); torch::Tensor D_MO_P = D_MO; double energy_hf_prev = energy_hf;
F = get_fock(H, J, D); torch::Tensor D_P = D; double energy_hf_prev = energy_hf;

// calculate the error vector and append it to the container along with the Fock matrix
torch::Tensor error = S_AO.mm(D_MO).mm(F_AO) - F_AO.mm(D_MO).mm(S_AO); if (i) errors.push_back(error), focks.push_back(F_AO);
torch::Tensor error = S.mm(D).mm(F) - F.mm(D).mm(S); if (i) errors.push_back(error), focks.push_back(F);

// truncate the error and Fock vector containers if they exceed the DIIS size
if (i > input.diis_size) errors.erase(errors.begin()), focks.erase(focks.begin());
Expand All @@ -79,30 +79,30 @@ std::tuple<torch::Tensor, torch::Tensor, double> HartreeFock::scf(const System&
torch::Tensor c = torch::linalg::solve(B, b, true);

// extrapolate the Fock matrix
F_AO = c.index({0}) * focks.at(0); for (int j = 1; j < input.diis_size; j++) F_AO += c.index({j}) * focks.at(j);
F = c.index({0}) * focks.at(0); for (int j = 1; j < input.diis_size; j++) F += c.index({j}) * focks.at(j);
}

// solve the Roothaan equations
torch::Tensor E_MO; std::tie(E_MO, C_MO) = torch::linalg::eigh(X.mm(F_AO).mm(X), "L"); C_MO = X.mm(C_MO);
torch::Tensor E; std::tie(E, C) = torch::linalg::eigh(X.mm(F).mm(X), "L"); C = X.mm(C);

// calculate the new density and energy
D_MO = get_density(system, C_MO), energy_hf = get_energy(H_AO, F_AO, D_MO);
D = get_density(system, C), energy_hf = get_energy(H, F, D);

// calculate the errors
double error_energy = std::abs(energy_hf - energy_hf_prev), error_density = (D_MO - D_MO_P).norm().item<double>();
double error_energy = std::abs(energy_hf - energy_hf_prev), error_density = (D - D_P).norm().item<double>();

// print the iteration info
std::printf("%6d %20.14f %.2e %.2e %s %s\n", i + 1, energy_hf, error_energy, error_density, Timer::Format(Timer::Elapsed(iteration_timer)).c_str(), input.diis_size && i >= input.diis_size ? "DIIS" : "");

// finish if covergence reached
if (std::abs(energy_hf - energy_hf_prev) < input.threshold && (D_MO - D_MO_P).norm().item<double>() < input.threshold) break;
if (std::abs(energy_hf - energy_hf_prev) < input.threshold && (D - D_P).norm().item<double>() < input.threshold) break;

// throw an exception if the maximum number of iterations reached
if (i == input.max_iter - 1) throw std::runtime_error("MAXIMUM NUMBER OF ITERATIONS IN THE SCF REACHED");
}

// transform the results to the spin basis
torch::Tensor C_MS = input.generalized ? C_MO : Transform::CoefficientSpin(C_MO); torch::Tensor F_MS = input.generalized ? Transform::SingleSpatial(F_AO, C_MS) : Transform::SingleSpin(F_AO, C_MS);
torch::Tensor C_MS = input.generalized ? C : Transform::CoefficientSpin(C); torch::Tensor F_MS = input.generalized ? Transform::SingleSpatial(F, C_MS) : Transform::SingleSpin(F, C_MS);

// return the converged results
return {F_MS, C_MS, energy_hf};
Expand Down
2 changes: 2 additions & 0 deletions src/input.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,3 +82,5 @@ nlohmann::json default_input = R"(
}
}
)"_json;

int nthread = 1;
70 changes: 34 additions & 36 deletions src/integral.cpp
Original file line number Diff line number Diff line change
@@ -1,36 +1,32 @@
#include "integral.h"

Integral::Integral(const Input::Integral& input) : precision(input.precision) {}

torch::Tensor Integral::double_electron(libint2::Engine& engine, const libint2::BasisSet& shells) {
// define the number of basis functions, matrix of integrals and shell to basis function map
int nbf = shells.nbf(); std::vector<double> ints(nbf * nbf * nbf * nbf, 0); std::vector<size_t> sh2bf = shells.shell2bf();

// generate engine for every thread
std::vector<libint2::Engine> engines(nthread, engine);

// loop over all unique elements
for (size_t i = 0; i < shells.size(); i++) {
for (size_t j = i; j < shells.size(); j++) {
for (size_t k = i; k < shells.size(); k++) {
for (size_t l = 0; l < shells.size(); l++) {

// calculate the integral and skip if it is zero
int integral_index = 0; engine.compute(shells.at(i), shells.at(j), shells.at(k), shells.at(l)); if (engine.results().at(0) == nullptr) continue;

// assign the integrals
for (size_t m = 0; m < shells.at(i).size(); m++) {
for (size_t n = 0; n < shells.at(j).size(); n++) {
for (size_t o = 0; o < shells.at(k).size(); o++) {
for (size_t p = 0; p < shells.at(l).size(); p++) {
ints.at((m + sh2bf.at(i)) * nbf * nbf * nbf + (n + sh2bf.at(j)) * nbf * nbf + (o + sh2bf.at(k)) * nbf + (p + sh2bf.at(l))) = engine.results().at(0)[integral_index ];
ints.at((m + sh2bf.at(i)) * nbf * nbf * nbf + (n + sh2bf.at(j)) * nbf * nbf + (p + sh2bf.at(l)) * nbf + (o + sh2bf.at(k))) = engine.results().at(0)[integral_index ];
ints.at((n + sh2bf.at(j)) * nbf * nbf * nbf + (m + sh2bf.at(i)) * nbf * nbf + (o + sh2bf.at(k)) * nbf + (p + sh2bf.at(l))) = engine.results().at(0)[integral_index ];
ints.at((n + sh2bf.at(j)) * nbf * nbf * nbf + (m + sh2bf.at(i)) * nbf * nbf + (p + sh2bf.at(l)) * nbf + (o + sh2bf.at(k))) = engine.results().at(0)[integral_index ];
ints.at((o + sh2bf.at(k)) * nbf * nbf * nbf + (p + sh2bf.at(l)) * nbf * nbf + (m + sh2bf.at(i)) * nbf + (n + sh2bf.at(j))) = engine.results().at(0)[integral_index ];
ints.at((o + sh2bf.at(k)) * nbf * nbf * nbf + (p + sh2bf.at(l)) * nbf * nbf + (n + sh2bf.at(j)) * nbf + (m + sh2bf.at(i))) = engine.results().at(0)[integral_index ];
ints.at((p + sh2bf.at(l)) * nbf * nbf * nbf + (o + sh2bf.at(k)) * nbf * nbf + (m + sh2bf.at(i)) * nbf + (n + sh2bf.at(j))) = engine.results().at(0)[integral_index ];
ints.at((p + sh2bf.at(l)) * nbf * nbf * nbf + (o + sh2bf.at(k)) * nbf * nbf + (n + sh2bf.at(j)) * nbf + (m + sh2bf.at(i))) = engine.results().at(0)[integral_index++];
}
}
}
#pragma omp parallel for num_threads(nthread)
for (size_t i = 0; i < shells.size(); i++) for (size_t j = i; j < shells.size(); j++) for (size_t k = i; k < shells.size(); k++) for (size_t l = (i == k ? j : k); l < shells.size(); l++) {

// calculate the integral and skip if it is zero
int integral_index = 0; engines.at(omp_get_thread_num()).compute(shells.at(i), shells.at(j), shells.at(k), shells.at(l)); if (engines.at(omp_get_thread_num()).results().at(0) == nullptr) continue;

// assign the integrals
for (size_t m = 0; m < shells.at(i).size(); m++) {
for (size_t n = 0; n < shells.at(j).size(); n++) {
for (size_t o = 0; o < shells.at(k).size(); o++) {
for (size_t p = 0; p < shells.at(l).size(); p++) {
ints.at((m + sh2bf.at(i)) * nbf * nbf * nbf + (n + sh2bf.at(j)) * nbf * nbf + (o + sh2bf.at(k)) * nbf + (p + sh2bf.at(l))) = engines.at(omp_get_thread_num()).results().at(0)[integral_index ];
ints.at((m + sh2bf.at(i)) * nbf * nbf * nbf + (n + sh2bf.at(j)) * nbf * nbf + (p + sh2bf.at(l)) * nbf + (o + sh2bf.at(k))) = engines.at(omp_get_thread_num()).results().at(0)[integral_index ];
ints.at((n + sh2bf.at(j)) * nbf * nbf * nbf + (m + sh2bf.at(i)) * nbf * nbf + (o + sh2bf.at(k)) * nbf + (p + sh2bf.at(l))) = engines.at(omp_get_thread_num()).results().at(0)[integral_index ];
ints.at((n + sh2bf.at(j)) * nbf * nbf * nbf + (m + sh2bf.at(i)) * nbf * nbf + (p + sh2bf.at(l)) * nbf + (o + sh2bf.at(k))) = engines.at(omp_get_thread_num()).results().at(0)[integral_index ];
ints.at((o + sh2bf.at(k)) * nbf * nbf * nbf + (p + sh2bf.at(l)) * nbf * nbf + (m + sh2bf.at(i)) * nbf + (n + sh2bf.at(j))) = engines.at(omp_get_thread_num()).results().at(0)[integral_index ];
ints.at((o + sh2bf.at(k)) * nbf * nbf * nbf + (p + sh2bf.at(l)) * nbf * nbf + (n + sh2bf.at(j)) * nbf + (m + sh2bf.at(i))) = engines.at(omp_get_thread_num()).results().at(0)[integral_index ];
ints.at((p + sh2bf.at(l)) * nbf * nbf * nbf + (o + sh2bf.at(k)) * nbf * nbf + (m + sh2bf.at(i)) * nbf + (n + sh2bf.at(j))) = engines.at(omp_get_thread_num()).results().at(0)[integral_index ];
ints.at((p + sh2bf.at(l)) * nbf * nbf * nbf + (o + sh2bf.at(k)) * nbf * nbf + (n + sh2bf.at(j)) * nbf + (m + sh2bf.at(i))) = engines.at(omp_get_thread_num()).results().at(0)[integral_index++];
}
}
}
Expand All @@ -45,19 +41,21 @@ torch::Tensor Integral::single_electron(libint2::Engine& engine, const libint2::
// define the number of basis functions, matrix of integrals and shell to basis function map
int nbf = shells.nbf(); std::vector<double> ints(nbf * nbf, 0); std::vector<size_t> sh2bf = shells.shell2bf();

// generate engine for every thread
std::vector<libint2::Engine> engines(nthread, engine);

// loop over all unique elements
for (size_t i = 0; i < shells.size(); i++) {
for (size_t j = i; j < shells.size(); j++) {
#pragma omp parallel for num_threads(nthread)
for (size_t i = 0; i < shells.size(); i++) for (size_t j = i; j < shells.size(); j++) {

// calculate the integral and skip if it is zero
int integral_index = 0; engine.compute(shells.at(i), shells.at(j)); if (engine.results().at(0) == nullptr) continue;
// calculate the integral and skip if it is zero
int integral_index = 0; engines.at(omp_get_thread_num()).compute(shells.at(i), shells.at(j)); if (engines.at(omp_get_thread_num()).results().at(0) == nullptr) continue;

// assign the integrals
for (size_t k = 0; k < shells.at(i).size(); k++) {
for (size_t l = 0; l < shells.at(j).size(); l++) {
ints.at((k + sh2bf.at(i)) * nbf + l + sh2bf.at(j)) = engine.results().at(0)[integral_index ];
ints.at((l + sh2bf.at(j)) * nbf + k + sh2bf.at(i)) = engine.results().at(0)[integral_index++];
}
// assign the integrals
for (size_t k = 0; k < shells.at(i).size(); k++) {
for (size_t l = 0; l < shells.at(j).size(); l++) {
ints.at((k + sh2bf.at(i)) * nbf + l + sh2bf.at(j)) = engines.at(omp_get_thread_num()).results().at(0)[integral_index ];
ints.at((l + sh2bf.at(j)) * nbf + k + sh2bf.at(i)) = engines.at(omp_get_thread_num()).results().at(0)[integral_index++];
}
}
}
Expand Down
Loading

0 comments on commit f702ff0

Please sign in to comment.