parallelize some stuff

tjira · Sep 8, 2024 · f702ff0 · f702ff0
1 parent d858638
commit f702ff0
Show file tree

Hide file tree

Showing 12 changed files with 74 additions and 91 deletions.
diff --git a/education/python/resmet.py b/education/python/resmet.py
@@ -39,7 +39,6 @@
     parser.add_argument("--ccd", help="Perform the doubles coupled clusters calculation.", action=ap.BooleanOptionalAction)
     parser.add_argument("--ccsd", help="Perform the singles/doubles coupled clusters calculation.", action=ap.BooleanOptionalAction)
     parser.add_argument("--ccsd-t", help="Perform the singles/doubles coupled clusters calculation with third order excitations included using perturbation theory.", action=ap.BooleanOptionalAction)
-    parser.add_argument("--lccd", help="Perform the linearized doubles coupled clusters calculation.", action=ap.BooleanOptionalAction)
     parser.add_argument("--mp2", help="Perform the second order Moller-Plesset calculation.", action=ap.BooleanOptionalAction)
     parser.add_argument("--mp3", help="Perform the third order Moller-Plesset calculation.", action=ap.BooleanOptionalAction)
 
@@ -155,26 +154,6 @@
         # initialize the first guess for the t-amplitudes
         t1, t2 = np.zeros((2 * nvirt, 2 * nocc)), Jmsa[v, v, o, o] * Emsd
 
-        # LCCD loop
-        if args.lccd:
-            while abs(E_LCCD - E_LCCD_P) > args.threshold:
-                # collect all the distinct terms
-                lccd1 = 0.5 * np.einsum("abcd,cdij->abij", Jmsa[v, v, v, v], t2, optimize=True)
-                lccd2 = 0.5 * np.einsum("klij,abkl->abij", Jmsa[o, o, o, o], t2, optimize=True)
-                lccd3 =       np.einsum("akic,bcjk->abij", Jmsa[v, o, o, v], t2, optimize=True)
-
-                # apply the permuation operator and add it to the corresponding term
-                lccd3 = lccd3 + lccd3.transpose(1, 0, 3, 2) - lccd3.transpose(1, 0, 2, 3) - lccd3.transpose(0, 1, 3, 2)
-
-                # update the t-amplitudes
-                t2 = Emsd * (Jmsa[v, v, o, o] + lccd1 + lccd2 + lccd3)
-
-                # evaluate the energy
-                E_LCCD_P, E_LCCD = E_LCCD, (1 / 4) * np.einsum("ijab,abij", Jmsa[o, o, v, v], t2, optimize=True)
-
-            # print the LCCD energy
-            print("   LCCD ENERGY: {:.8f}".format(E_HF + E_LCCD + VNN))
-
         # CCD loop
         if args.ccd:
             while abs(E_CCD - E_CCD_P) > args.threshold:

diff --git a/include/hartreefock.h b/include/hartreefock.h
@@ -12,7 +12,7 @@ class HartreeFock {
     double get_energy(const torch::Tensor& H_AO, const torch::Tensor& F_AO, const torch::Tensor& D_MO) const;
     torch::Tensor get_fock(const torch::Tensor& H_AO, const torch::Tensor& J_AO, const torch::Tensor& D_MO) const;
     std::tuple<torch::Tensor, torch::Tensor, double> run(const System& system, const torch::Tensor& H_AO, const torch::Tensor& S_AO, const torch::Tensor& J_AO, torch::Tensor D_MO) const;
-    std::tuple<torch::Tensor, torch::Tensor, double> scf(const System& system, const torch::Tensor& H_AO, const torch::Tensor& S_AO, const torch::Tensor& J_AO, torch::Tensor D_MO) const;
+    std::tuple<torch::Tensor, torch::Tensor, double> scf(const System& system, const torch::Tensor& H, const torch::Tensor& S, const torch::Tensor& J, torch::Tensor D) const;
 
 private:
     Input::HartreeFock input;

diff --git a/include/input.h b/include/input.h
@@ -49,7 +49,7 @@ struct Input {
     } classical_dynamics;
 };
 
-extern nlohmann::json default_input;
+extern nlohmann::json default_input; extern int nthread;
 
 NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Input::System, basis, path, charge, multiplicity);
 NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Input::Wavefunction, dimension, mass, momentum, grid_limits, grid_points, guess);

diff --git a/include/integral.h b/include/integral.h
@@ -6,7 +6,7 @@
 
 class Integral {
 public:
-    Integral(const Input::Integral& input);
+    Integral(const Input::Integral& input) : precision(input.precision) {}
 
     static torch::Tensor double_electron(libint2::Engine& engine, const libint2::BasisSet& shells);
     static torch::Tensor single_electron(libint2::Engine& engine, const libint2::BasisSet& shells);

diff --git a/script/libfftw.sh b/script/libfftw.sh
@@ -7,7 +7,7 @@ mkdir -p external && wget -O external/libfftw.tar.gz https://www.fftw.org/fftw-3
 cd external && rm -rf libfftw && tar -xzvf libfftw.tar.gz && mv fftw* libfftw && cd -
 
 # configure FFTW
-cd external/libfftw && cmake -B build -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX="$PWD/install" && cd -
+cd external/libfftw && cmake -B build -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DCMAKE_INSTALL_PREFIX="$PWD/install" && cd -
 
 # compile and install FFTW
 cd external/libfftw && cmake --build build --parallel 2 && cmake --install build && cd -
@@ -18,8 +18,5 @@ cp -r external/libfftw/install/include external/libfftw/install/lib external/
 # remove redundant files
 rm -rf external/libfftw external/include/*.f external/include/*.f03 external/lib/cmake external/lib/pkgconfig
 
-# rename the shared library
-cd external/lib && [[ -f libfftw3.so.3 ]] && mv libfftw3.so.3 libfftw.so && patchelf --set-soname libfftw.so libfftw.so && rm libfftw3.* ; cd -
-
 # rename the static library
-cd external/lib && [[ -f libfftw3.a ]] && mv libfftw3.a libfftw.a ; cd -
+cd external/lib && mv libfftw3.a libfftw.a ; cd -
diff --git a/script/libint.sh b/script/libint.sh
@@ -1,25 +1,16 @@
 #!/bin/bash
 
 # download libint
-mkdir -p external && wget -O external/libint.tgz https://github.com/evaleev/libint/releases/download/v2.9.0/libint-2.9.0-mpqc4.tgz
+mkdir -p external && git clone https://github.com/evaleev/libint.git external/libint
 
-# unpack libint
-cd external && rm -rf libint && tar -xvf libint.tgz --warning=no-unknown-keyword && mv libint-* libint && cd -
-
-# configure libint
-cd external/libint && cmake -B build -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX="$PWD/install" && cd -
-
-# compile and install libint
-cd external/libint && cmake --build build --parallel 2 && cmake --install build && cd -
+# configure, compile and install libint
+cd external/libint && ./autogen.sh && ./configure CXX=g++ CXXFLAGS="-march=native -s -O3 -flto=auto" --prefix="$PWD/install" --enable-1body=1 --enable-eri=1 --with-max-am=6 && make -j2 && make install && cd -
 
 # copy the compiled library
 cp -r external/libint/install/include external/libint/install/lib external/
 
 # remove redundant files
-rm -rf external/libint external/lib/cmake external/lib/pkgconfig
-
-# rename the shared library
-cd external/lib && [[ -f libint2.so ]] && mv libint2.so.2.9.0 libint.so && patchelf --set-soname libint.so libint.so && rm libint2.* ; cd -
+rm -rf external/libint external/lib/*.la external/lib/pkgconfig
 
 # rename the library
-cd external/lib && [[ -f libint2.a ]] && mv libint2.a libint.a ; cd -
+cd external/lib && mv libint2.a libint.a ; cd -
diff --git a/src/configurationinteraction.cpp b/src/configurationinteraction.cpp
@@ -85,6 +85,7 @@ std::tuple<torch::Tensor, torch::Tensor> ConfigurationInteraction::run(const Sys
     Timepoint hamiltonian_timer = Timer::Now();
 
     // fill the CI Hamiltonian
+    #pragma omp parallel for num_threads(nthread)
     for (size_t i = 0; i < dets.size(); i++) {
         for (size_t j = i; j < dets.size(); j++) {
 

diff --git a/src/hartreefock.cpp b/src/hartreefock.cpp
@@ -37,12 +37,12 @@ std::tuple<torch::Tensor, torch::Tensor, double> HartreeFock::run(const System&
     return input.generalized ? scf(system, H_AS, S_AS, J_AS, D_MS) : scf(system, H_AO, S_AO, J_AO, D_MO);
 }
 
-std::tuple<torch::Tensor, torch::Tensor, double> HartreeFock::scf(const System& system, const torch::Tensor& H_AO, const torch::Tensor& S_AO, const torch::Tensor& J_AO, torch::Tensor D_MO) const {
+std::tuple<torch::Tensor, torch::Tensor, double> HartreeFock::scf(const System& system, const torch::Tensor& H, const torch::Tensor& S, const torch::Tensor& J, torch::Tensor D) const {
     // define the Fock matrix, orbital coefficients, electron energy and containers for errors and Fock matrices
-    torch::Tensor F_AO = H_AO, C_MO = torch::zeros_like(D_MO); double energy_hf = 0; std::vector<torch::Tensor> errors, focks;
+    torch::Tensor F = H, C = torch::zeros_like(D); double energy_hf = 0; std::vector<torch::Tensor> errors, focks;
 
     // calculate the complementary X matrix as square root of the inverse of the overlap matrix
-    auto [SVAL, SVEC] = torch::linalg::eigh(S_AO, "L"); torch::Tensor X = SVEC.mm(torch::diag(1 / torch::sqrt(SVAL))).mm(SVEC.swapaxes(0, 1));
+    auto [SVAL, SVEC] = torch::linalg::eigh(S, "L"); torch::Tensor X = SVEC.mm(torch::diag(1 / torch::sqrt(SVAL))).mm(SVEC.swapaxes(0, 1));
 
     // print the header
     std::printf("%s HF ENERGY CALCULATION\n%6s %20s %8s %8s %12s\n", input.generalized ? "GENERALIZED" : "RESTRICTED", "ITER", "ENERGY", "|dE|", "|dD|", "TIMER");
@@ -54,10 +54,10 @@ std::tuple<torch::Tensor, torch::Tensor, double> HartreeFock::scf(const System&
         Timepoint iteration_timer = Timer::Now();
 
         // calculate the Fock matrix and save the previous values
-        F_AO = get_fock(H_AO, J_AO, D_MO); torch::Tensor D_MO_P = D_MO; double energy_hf_prev = energy_hf;
+        F = get_fock(H, J, D); torch::Tensor D_P = D; double energy_hf_prev = energy_hf;
 
         // calculate the error vector and append it to the container along with the Fock matrix
-        torch::Tensor error = S_AO.mm(D_MO).mm(F_AO) - F_AO.mm(D_MO).mm(S_AO); if (i) errors.push_back(error), focks.push_back(F_AO);
+        torch::Tensor error = S.mm(D).mm(F) - F.mm(D).mm(S); if (i) errors.push_back(error), focks.push_back(F);
 
         // truncate the error and Fock vector containers if they exceed the DIIS size
         if (i > input.diis_size) errors.erase(errors.begin()), focks.erase(focks.begin());
@@ -79,30 +79,30 @@ std::tuple<torch::Tensor, torch::Tensor, double> HartreeFock::scf(const System&
             torch::Tensor c = torch::linalg::solve(B, b, true);
 
             // extrapolate the Fock matrix
-            F_AO = c.index({0}) * focks.at(0); for (int j = 1; j < input.diis_size; j++) F_AO += c.index({j}) * focks.at(j);
+            F = c.index({0}) * focks.at(0); for (int j = 1; j < input.diis_size; j++) F += c.index({j}) * focks.at(j);
         }
 
         // solve the Roothaan equations
-        torch::Tensor E_MO; std::tie(E_MO, C_MO) = torch::linalg::eigh(X.mm(F_AO).mm(X), "L"); C_MO = X.mm(C_MO);
+        torch::Tensor E; std::tie(E, C) = torch::linalg::eigh(X.mm(F).mm(X), "L"); C = X.mm(C);
 
         // calculate the new density and energy
-        D_MO = get_density(system, C_MO), energy_hf = get_energy(H_AO, F_AO, D_MO);
+        D = get_density(system, C), energy_hf = get_energy(H, F, D);
 
         // calculate the errors
-        double error_energy = std::abs(energy_hf - energy_hf_prev), error_density = (D_MO - D_MO_P).norm().item<double>();
+        double error_energy = std::abs(energy_hf - energy_hf_prev), error_density = (D - D_P).norm().item<double>();
 
         // print the iteration info
         std::printf("%6d %20.14f %.2e %.2e %s %s\n", i + 1, energy_hf, error_energy, error_density, Timer::Format(Timer::Elapsed(iteration_timer)).c_str(), input.diis_size && i >= input.diis_size ? "DIIS" : "");
 
         // finish if covergence reached
-        if (std::abs(energy_hf - energy_hf_prev) < input.threshold && (D_MO - D_MO_P).norm().item<double>() < input.threshold) break;
+        if (std::abs(energy_hf - energy_hf_prev) < input.threshold && (D - D_P).norm().item<double>() < input.threshold) break;
 
         // throw an exception if the maximum number of iterations reached
         if (i == input.max_iter - 1) throw std::runtime_error("MAXIMUM NUMBER OF ITERATIONS IN THE SCF REACHED");
     }
 
     // transform the results to the spin basis
-    torch::Tensor C_MS = input.generalized ? C_MO : Transform::CoefficientSpin(C_MO); torch::Tensor F_MS = input.generalized ? Transform::SingleSpatial(F_AO, C_MS) : Transform::SingleSpin(F_AO, C_MS);
+    torch::Tensor C_MS = input.generalized ? C : Transform::CoefficientSpin(C); torch::Tensor F_MS = input.generalized ? Transform::SingleSpatial(F, C_MS) : Transform::SingleSpin(F, C_MS);
 
     // return the converged results
     return {F_MS, C_MS, energy_hf};

diff --git a/src/input.cpp b/src/input.cpp
@@ -82,3 +82,5 @@ nlohmann::json default_input = R"(
     }
 }
 )"_json;
+
+int nthread = 1;
diff --git a/src/integral.cpp b/src/integral.cpp
@@ -1,36 +1,32 @@
 #include "integral.h"
 
-Integral::Integral(const Input::Integral& input) : precision(input.precision) {}
-
 torch::Tensor Integral::double_electron(libint2::Engine& engine, const libint2::BasisSet& shells) {
     // define the number of basis functions, matrix of integrals and shell to basis function map
     int nbf = shells.nbf(); std::vector<double> ints(nbf * nbf * nbf * nbf, 0); std::vector<size_t> sh2bf = shells.shell2bf();
 
+    // generate engine for every thread
+    std::vector<libint2::Engine> engines(nthread, engine);
+
     // loop over all unique elements
-    for (size_t i = 0; i < shells.size(); i++) {
-        for (size_t j = i; j < shells.size(); j++) {
-            for (size_t k = i; k < shells.size(); k++) {
-                for (size_t l = 0; l < shells.size(); l++) {
-
-                    // calculate the integral and skip if it is zero
-                    int integral_index = 0; engine.compute(shells.at(i), shells.at(j), shells.at(k), shells.at(l)); if (engine.results().at(0) == nullptr) continue;
-
-                    // assign the integrals
-                    for (size_t m = 0; m < shells.at(i).size(); m++) {
-                        for (size_t n = 0; n < shells.at(j).size(); n++) {
-                            for (size_t o = 0; o < shells.at(k).size(); o++) {
-                                for (size_t p = 0; p < shells.at(l).size(); p++) {
-                                    ints.at((m + sh2bf.at(i)) * nbf * nbf * nbf + (n + sh2bf.at(j)) * nbf * nbf + (o + sh2bf.at(k)) * nbf + (p + sh2bf.at(l))) = engine.results().at(0)[integral_index  ];
-                                    ints.at((m + sh2bf.at(i)) * nbf * nbf * nbf + (n + sh2bf.at(j)) * nbf * nbf + (p + sh2bf.at(l)) * nbf + (o + sh2bf.at(k))) = engine.results().at(0)[integral_index  ];
-                                    ints.at((n + sh2bf.at(j)) * nbf * nbf * nbf + (m + sh2bf.at(i)) * nbf * nbf + (o + sh2bf.at(k)) * nbf + (p + sh2bf.at(l))) = engine.results().at(0)[integral_index  ];
-                                    ints.at((n + sh2bf.at(j)) * nbf * nbf * nbf + (m + sh2bf.at(i)) * nbf * nbf + (p + sh2bf.at(l)) * nbf + (o + sh2bf.at(k))) = engine.results().at(0)[integral_index  ];
-                                    ints.at((o + sh2bf.at(k)) * nbf * nbf * nbf + (p + sh2bf.at(l)) * nbf * nbf + (m + sh2bf.at(i)) * nbf + (n + sh2bf.at(j))) = engine.results().at(0)[integral_index  ];
-                                    ints.at((o + sh2bf.at(k)) * nbf * nbf * nbf + (p + sh2bf.at(l)) * nbf * nbf + (n + sh2bf.at(j)) * nbf + (m + sh2bf.at(i))) = engine.results().at(0)[integral_index  ];
-                                    ints.at((p + sh2bf.at(l)) * nbf * nbf * nbf + (o + sh2bf.at(k)) * nbf * nbf + (m + sh2bf.at(i)) * nbf + (n + sh2bf.at(j))) = engine.results().at(0)[integral_index  ];
-                                    ints.at((p + sh2bf.at(l)) * nbf * nbf * nbf + (o + sh2bf.at(k)) * nbf * nbf + (n + sh2bf.at(j)) * nbf + (m + sh2bf.at(i))) = engine.results().at(0)[integral_index++];
-                                }
-                            }
-                        }
+    #pragma omp parallel for num_threads(nthread)
+    for (size_t i = 0; i < shells.size(); i++) for (size_t j = i; j < shells.size(); j++) for (size_t k = i; k < shells.size(); k++) for (size_t l = (i == k ? j : k); l < shells.size(); l++) {
+
+        // calculate the integral and skip if it is zero
+        int integral_index = 0; engines.at(omp_get_thread_num()).compute(shells.at(i), shells.at(j), shells.at(k), shells.at(l)); if (engines.at(omp_get_thread_num()).results().at(0) == nullptr) continue;
+
+        // assign the integrals
+        for (size_t m = 0; m < shells.at(i).size(); m++) {
+            for (size_t n = 0; n < shells.at(j).size(); n++) {
+                for (size_t o = 0; o < shells.at(k).size(); o++) {
+                    for (size_t p = 0; p < shells.at(l).size(); p++) {
+                        ints.at((m + sh2bf.at(i)) * nbf * nbf * nbf + (n + sh2bf.at(j)) * nbf * nbf + (o + sh2bf.at(k)) * nbf + (p + sh2bf.at(l))) = engines.at(omp_get_thread_num()).results().at(0)[integral_index  ];
+                        ints.at((m + sh2bf.at(i)) * nbf * nbf * nbf + (n + sh2bf.at(j)) * nbf * nbf + (p + sh2bf.at(l)) * nbf + (o + sh2bf.at(k))) = engines.at(omp_get_thread_num()).results().at(0)[integral_index  ];
+                        ints.at((n + sh2bf.at(j)) * nbf * nbf * nbf + (m + sh2bf.at(i)) * nbf * nbf + (o + sh2bf.at(k)) * nbf + (p + sh2bf.at(l))) = engines.at(omp_get_thread_num()).results().at(0)[integral_index  ];
+                        ints.at((n + sh2bf.at(j)) * nbf * nbf * nbf + (m + sh2bf.at(i)) * nbf * nbf + (p + sh2bf.at(l)) * nbf + (o + sh2bf.at(k))) = engines.at(omp_get_thread_num()).results().at(0)[integral_index  ];
+                        ints.at((o + sh2bf.at(k)) * nbf * nbf * nbf + (p + sh2bf.at(l)) * nbf * nbf + (m + sh2bf.at(i)) * nbf + (n + sh2bf.at(j))) = engines.at(omp_get_thread_num()).results().at(0)[integral_index  ];
+                        ints.at((o + sh2bf.at(k)) * nbf * nbf * nbf + (p + sh2bf.at(l)) * nbf * nbf + (n + sh2bf.at(j)) * nbf + (m + sh2bf.at(i))) = engines.at(omp_get_thread_num()).results().at(0)[integral_index  ];
+                        ints.at((p + sh2bf.at(l)) * nbf * nbf * nbf + (o + sh2bf.at(k)) * nbf * nbf + (m + sh2bf.at(i)) * nbf + (n + sh2bf.at(j))) = engines.at(omp_get_thread_num()).results().at(0)[integral_index  ];
+                        ints.at((p + sh2bf.at(l)) * nbf * nbf * nbf + (o + sh2bf.at(k)) * nbf * nbf + (n + sh2bf.at(j)) * nbf + (m + sh2bf.at(i))) = engines.at(omp_get_thread_num()).results().at(0)[integral_index++];
                     }
                 }
             }
@@ -45,19 +41,21 @@ torch::Tensor Integral::single_electron(libint2::Engine& engine, const libint2::
     // define the number of basis functions, matrix of integrals and shell to basis function map
     int nbf = shells.nbf(); std::vector<double> ints(nbf * nbf, 0); std::vector<size_t> sh2bf = shells.shell2bf();
 
+    // generate engine for every thread
+    std::vector<libint2::Engine> engines(nthread, engine);
+
     // loop over all unique elements
-    for (size_t i = 0; i < shells.size(); i++) {
-        for (size_t j = i; j < shells.size(); j++) {
+    #pragma omp parallel for num_threads(nthread)
+    for (size_t i = 0; i < shells.size(); i++) for (size_t j = i; j < shells.size(); j++) {
 
-            // calculate the integral and skip if it is zero
-            int integral_index = 0; engine.compute(shells.at(i), shells.at(j)); if (engine.results().at(0) == nullptr) continue;
+        // calculate the integral and skip if it is zero
+        int integral_index = 0; engines.at(omp_get_thread_num()).compute(shells.at(i), shells.at(j)); if (engines.at(omp_get_thread_num()).results().at(0) == nullptr) continue;
 
-            // assign the integrals
-            for (size_t k = 0; k < shells.at(i).size(); k++) {
-                for (size_t l = 0; l < shells.at(j).size(); l++) {
-                    ints.at((k + sh2bf.at(i)) * nbf + l + sh2bf.at(j)) = engine.results().at(0)[integral_index  ];
-                    ints.at((l + sh2bf.at(j)) * nbf + k + sh2bf.at(i)) = engine.results().at(0)[integral_index++];
-                }
+        // assign the integrals
+        for (size_t k = 0; k < shells.at(i).size(); k++) {
+            for (size_t l = 0; l < shells.at(j).size(); l++) {
+                ints.at((k + sh2bf.at(i)) * nbf + l + sh2bf.at(j)) = engines.at(omp_get_thread_num()).results().at(0)[integral_index  ];
+                ints.at((l + sh2bf.at(j)) * nbf + k + sh2bf.at(i)) = engines.at(omp_get_thread_num()).results().at(0)[integral_index++];
             }
         }
     }