From 2598b56cfaec69eb7945d81ceac6f274a51b03a9 Mon Sep 17 00:00:00 2001
From: JacksonCampolattaro <jackcamp@vt.edu>
Date: Sun, 16 Jun 2024 19:41:39 +0200
Subject: [PATCH] Use forced-inlining to squeeze better performance out of the
 'einsum' version of the gravity derivatives.

---
 benchmarks/multipoleMoment.cpp          | 12 +++----
 include/symtensor/Index.h               |  2 +-
 include/symtensor/SymmetricTensorBase.h |  5 +--
 include/symtensor/gravity.h             | 44 ++++++++++++++++---------
 include/symtensor/util.h                | 30 +++++++++++++++++
 5 files changed, 68 insertions(+), 25 deletions(-)
diff --git a/benchmarks/multipoleMoment.cpp b/benchmarks/multipoleMoment.cpp
index 18aa461..b73ed97 100644
--- a/benchmarks/multipoleMoment.cpp
+++ b/benchmarks/multipoleMoment.cpp
@@ -29,12 +29,12 @@ TEST_CASE("benchmark: Gravity derivatives construction", "[Gravity]") {
     CHECK((gravity::D<3>(R) - gravity::derivative<3>(R)).norm() < 1e-7);
     CHECK((gravity::D<4>(R) - gravity::derivative<4>(R)).norm() < 1e-7);
 
-    //        BENCHMARK("D' Construction") { return gravity::D<1>(R); };
-    //        BENCHMARK("D' Construction *") { return gravity::derivative<1>(R); };
-    //        BENCHMARK("D'' Construction") { return gravity::D<2>(R); };
-    //        BENCHMARK("D'' Construction *") { return gravity::derivative<2>(R); };
-    //        BENCHMARK("D''' Construction") { return gravity::D<3>(R); };
-    //        BENCHMARK("D''' Construction *") { return gravity::derivative<3>(R); };
+    BENCHMARK("D' Construction") { return gravity::D<1>(R); };
+    BENCHMARK("D' Construction *") { return gravity::derivative<1>(R); };
+    BENCHMARK("D'' Construction") { return gravity::D<2>(R); };
+    BENCHMARK("D'' Construction *") { return gravity::derivative<2>(R); };
+    BENCHMARK("D''' Construction") { return gravity::D<3>(R); };
+    BENCHMARK("D''' Construction *") { return gravity::derivative<3>(R); };
     BENCHMARK("D'''' Construction") { return gravity::D<4>(R); };
     BENCHMARK("D'''' Construction *") { return gravity::derivative<4>(R); };
 }
diff --git a/include/symtensor/Index.h b/include/symtensor/Index.h
index c51edf9..16f8de7 100644
--- a/include/symtensor/Index.h
+++ b/include/symtensor/Index.h
@@ -238,7 +238,7 @@ namespace symtensor {
         if (R == 1) return static_cast<std::size_t>(dimensionalIndices[0]) - lowestIndex;
 
         // Ensure the indices are in the canonical order
-        std::sort(dimensionalIndices.begin(), dimensionalIndices.end());
+        dimensionalIndices = sorted(dimensionalIndices);
 
         if (dimensionalIndices[0] == I{lowestIndex}) {
             // If the first index is X, then we know we are in the first portion of the range,
diff --git a/include/symtensor/SymmetricTensorBase.h b/include/symtensor/SymmetricTensorBase.h
index 40477e9..27b7adb 100644
--- a/include/symtensor/SymmetricTensorBase.h
+++ b/include/symtensor/SymmetricTensorBase.h
@@ -215,15 +215,16 @@ namespace symtensor {
          * @return a symmetric tensor with each value set by evaluating the function.
          */
         template<typename F>
+        [[clang::always_inline, gnu::always_inline]]
         inline static constexpr Implementation NullaryExpression(F function = {}) {
             if constexpr (requires { function.template operator()<dimensionalIndices(0)>(); }) {
                 // If a function provides a template parameter for compile-time indexing, prefer that
-                return [&]<std::size_t... i>(std::index_sequence<i...>) constexpr {
+                return [&]<std::size_t... i>(std::index_sequence<i...>) __attribute__((always_inline)) constexpr {
                     return Implementation{static_cast<S>(function.template operator()<dimensionalIndices(i)>())...};
                 }(std::make_index_sequence<NumUniqueValues>());
             } else {
                 // Otherwise, the function must take the indices as its only argument
-                return [&]<std::size_t... i>(std::index_sequence<i...>) constexpr {
+                return [&]<std::size_t... i>(std::index_sequence<i...>) __attribute__((always_inline)) constexpr {
                     return Implementation{static_cast<S>(function(dimensionalIndices(i)))...};
                 }(std::make_index_sequence<NumUniqueValues>());
             }
diff --git a/include/symtensor/gravity.h b/include/symtensor/gravity.h
index 4eef151..10b7175 100644
--- a/include/symtensor/gravity.h
+++ b/include/symtensor/gravity.h
@@ -10,6 +10,8 @@
 
 namespace symtensor::gravity {
 
+
+
     template<auto index, indexable Vector>
     inline constexpr auto product_of_elements(const Vector &v) {
         // todo: maybe don't use std::apply for this?
@@ -31,12 +33,19 @@ namespace symtensor::gravity {
     }
 
     template<auto index, std::size_t N, indexable Vector>
-    inline auto derivative_at(const Vector &R, const auto &g) {
+    [[clang::always_inline, gnu::always_inline]] inline auto derivative_at(const Vector &R) {
+
+        auto r = glm::length(R);
+        auto g1 = -1.0f / pow<3>(r);
+        auto g2 = 3.0f / pow<5>(r);
+        auto g3 = -15.0f / pow<7>(r);
+        auto g4 = 105.0f / pow<9>(r);
+
         auto cartesian_product_term = product_of_elements<index>(R);
         if constexpr (N == 1) {
-            return R[static_cast<std::size_t>(index[0])] * g[1];
+            return R[static_cast<std::size_t>(index[0])] * g1;
         } else if constexpr (N == 2) {
-            return g[1] * kronecker_delta<float>(index) + g[2] * cartesian_product_term;
+            return g1 * kronecker_delta<float>(index) + g2 * cartesian_product_term;
         } else if constexpr (N == 3) {
             constexpr auto partitions = binomial_partitions<1>(index);
             auto kronecker_product_term = [&]<auto... i>(std::index_sequence<i...>) constexpr {
@@ -53,9 +62,9 @@ namespace symtensor::gravity {
             //                        R[static_cast<std::size_t>(std::get<0>(partition)[0])] : 0
             //                ) + ...);
             //            }, partitions);
-            return g[2] * kronecker_product_term + g[3] * cartesian_product_term;
+            return g2 * kronecker_product_term + g3 * cartesian_product_term;
         } else if constexpr (N == 4) {
-            auto result = g[4] * cartesian_product_term;
+            auto result = g4 * cartesian_product_term;
             constexpr auto partitions = binomial_partitions<2>(index);
             {
                 constexpr auto kronecker_term = [&]<auto... i>(std::index_sequence<i...>) constexpr {
@@ -65,7 +74,7 @@ namespace symtensor::gravity {
                     ) + ...);
                 }(std::make_index_sequence<partitions.size() / 2>());
                 if constexpr (kronecker_term > 0)
-                    result += g[2] * kronecker_term;
+                    result += g2 * kronecker_term;
             }
             {
                 //                constexpr auto nonzero_index_pairs = filter<partitions, [](auto p) constexpr {
@@ -93,7 +102,7 @@ namespace symtensor::gravity {
                     ) + ...);
                 }(std::make_index_sequence<partitions.size()>());
                 if constexpr (kronecker_product_is_nonzero)
-                    result += g[3] * kronecker_product_term;
+                    result += g3 * kronecker_product_term;
             }
 
             return result;
@@ -102,16 +111,10 @@ namespace symtensor::gravity {
     }
 
     template<std::size_t N, indexable Vector>
+    [[clang::always_inline, gnu::always_inline]]
     inline auto derivative(const Vector &R) {
-        auto r = glm::length(R);
-        // todo: factor these out into their own function
-        auto g1 = -1.0f / pow<3>(r);
-        auto g2 = 3.0f / pow<5>(r);
-        auto g3 = -15.0f / pow<7>(r);
-        auto g4 = 105.0f / pow<9>(r);
-        auto g_coeffs = std::array<decltype(r), 5>{1.0, g1, g2, g3, g4};
-        return SymmetricTensor3f<N>::NullaryExpression([&]<auto index>() constexpr {
-            return derivative_at<index, N>(R, g_coeffs);
+        return SymmetricTensor3f<N>::NullaryExpression([&]<auto index>()  __attribute__((always_inline)) constexpr {
+            return derivative_at<index, N>(R);
         });
 
     };
@@ -199,6 +202,15 @@ namespace symtensor::gravity {
         }
     }
 
+
+    auto derivative4(const glm::vec3 &R) {
+        return derivative<4>(R);
+    }
+
+    auto D4(const glm::vec3 &R) {
+        return D<4>(R);
+    }
+
 }
 
 #endif //SYMTENSOR_GRAVITY_H
diff --git a/include/symtensor/util.h b/include/symtensor/util.h
index f7426b9..e9935a2 100644
--- a/include/symtensor/util.h
+++ b/include/symtensor/util.h
@@ -85,6 +85,12 @@ namespace symtensor {
         }
     }
 
+    template<std::size_t N, typename T>
+    inline constexpr std::array<T, N> sorted(std::array<T, N> array) {
+        std::sort(array.begin(), array.end());
+        return array;
+    }
+
     template<typename T, std::size_t NA, std::size_t NB>
     constexpr std::array<T, NA + NB> concatenate(const std::array<T, NA> &a, const std::array<T, NB> &b) {
         std::array<T, NA + NB> result{};
@@ -271,6 +277,8 @@ namespace symtensor {
                 else
                     *(b_out++) = set[i];
             }
+            get<0>(*pair_out) = sorted(get<0>(*pair_out));
+            get<1>(*pair_out) = sorted(get<1>(*pair_out));
 
             pair_out++;
         } while (std::prev_permutation(selection.begin(), selection.end()));
@@ -371,6 +379,28 @@ namespace symtensor {
         return sum;
     }
 
+    template<typename T, std::size_t N>
+    constexpr std::array<std::size_t, N> countAppearances(const std::array<T, N> &arr) {
+        std::array<std::size_t, N> counts = {};
+        for (std::size_t i = 0; i < N; ++i) {
+            for (std::size_t j = 0; j < N; ++j) {
+                if (arr[i] == arr[j]) {
+                    ++counts[i];
+                }
+            }
+        }
+        return counts;
+    }
+
+    template<typename T, std::size_t N>
+    constexpr std::size_t numUniquePermutations(const std::array<T, N> &arr) {
+        const auto counts = countOccurrences(arr);
+        unsigned long long divisor = 1;
+        for (std::size_t i = 0; i < N; ++i)
+            divisor *= factorial(counts[i]);
+        return factorial(N) / divisor;
+    }
+
     template<typename F, typename T, std::size_t N>
     inline constexpr auto deduplicated_sum(const std::array<T, N> &values, F function = {}) {
         // todo: This may be very important for good performance!