Merge branch 'release-2.4.0'

cp2k · Aug 29, 2022 · a9d653e
2 parents c134996 + c262d2f
commit a9d653e
Show file tree

Hide file tree

Showing 22 changed files with 717 additions and 268 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -5,21 +5,21 @@ exclude: '^tools/(build_utils/fypp)'
 fail_fast: false
 repos:
 - repo: https://github.com/ambv/black
-  rev: 22.3.0
+  rev: 22.6.0
   hooks:
   - id: black
     name: Reformat Python files with the black code formatter
     files: '^.*(/PACKAGE)|(\.py)$'
-- repo: https://gitlab.com/pycqa/flake8
-  rev: 4.0.1
+- repo: https://github.com/pycqa/flake8
+  rev: 5.0.4
   hooks:
   - id: flake8
     exclude: >-
       (?x)^(
         .cp2k/.*|
       )$
 - repo: https://github.com/pre-commit/pre-commit-hooks
-  rev: v4.1.0
+  rev: v4.3.0
   hooks:
   - id: check-ast
   - id: check-yaml

diff --git a/VERSION b/VERSION
@@ -1,8 +1,8 @@
 MAJOR = 2
-MINOR = 3
+MINOR = 4
 PATCH = 0
 # A specific DATE (YYYY-MM-DD) fixes an official release, otherwise
 # it is considered Development version.
-DATE  = 2022-06-26
+DATE  = 2022-08-29
 
 
diff --git a/docs/guide/2-user-guide/1-installation/index.md b/docs/guide/2-user-guide/1-installation/index.md
@@ -71,7 +71,7 @@ make
 -DWITH_CUDA_PROFILING=<OFF|ON>
 -DWITH_C_API=<ON|OFF>
 -DWITH_EXAMPLES=<ON|OFF>
--DWITH_GPU=<P100|K20X|K40|K80|V100|Mi50|Mi100>
+-DWITH_GPU=<P100|K20X|K40|K80|V100|Mi50|Mi100|Mi250>
 -DCMAKE_BUILD_TYPE=<Release|Debug|Coverage>
 -DBUILD_TESTING=<ON|OFF>
 -DTEST_MPI_RANKS=<auto,N>

diff --git a/src/acc/acc_bench_smm.c b/src/acc/acc_bench_smm.c
@@ -25,6 +25,19 @@
 #  else
 #    define ACC_BENCH_USEOMP(FUNC) (FUNC)
 #  endif
+#  if LIBXSMM_VERSION4(1, 17, 0, 2776) <= LIBXSMM_VERSION_NUMBER
+#    define ACC_BENCH_GEMM_BATCH(IPREC, OPREC, TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, STRIDE_A, B, LDB, STRIDE_B, BETA, C, LDC, \
+      STRIDE_C, INDEX_STRIDE, INDEX_BASE, BATCHSIZE) \
+      ACC_BENCH_USEOMP(libxsmm_gemm_batch) \
+      (IPREC, OPREC, TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, STRIDE_A, B, LDB, STRIDE_B, BETA, C, LDC, STRIDE_C, INDEX_STRIDE, \
+        INDEX_BASE, BATCHSIZE, 0 /*batchcheck*/)
+#  else
+#    define ACC_BENCH_GEMM_BATCH(IPREC, OPREC, TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, STRIDE_A, B, LDB, STRIDE_B, BETA, C, LDC, \
+      STRIDE_C, INDEX_STRIDE, INDEX_BASE, BATCHSIZE) \
+      ACC_BENCH_USEOMP(libxsmm_gemm_batch) \
+      ((libxsmm_gemm_precision)(IPREC), (libxsmm_gemm_precision)(OPREC), TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, \
+        LDC, INDEX_BASE, INDEX_STRIDE, STRIDE_A, STRIDE_B, STRIDE_C, BATCHSIZE)
+#  endif
 #  define PRINTF(...) \
     do { \
       const size_t print_buffer_size = sizeof(print_buffer) - print_offset; \
@@ -420,19 +433,17 @@ int main(int argc, char* argv[]) {
 #    endif
         memset(gold_hst, 0, sizeof(ELEM_TYPE) * mn * nc);
         for (r = 0; r < warmup; ++r) {
-          ACC_BENCH_USEOMP(libxsmm_gemm_batch)
-          (LIBXSMM_DATATYPE(ELEM_TYPE), LIBXSMM_DATATYPE(ELEM_TYPE), &transa, &transb, m, n, k, &alpha, amat_hst, &m /*lda*/,
-            bmat_hst, &k /*ldb*/, &beta, gold_hst, &m /*ldc*/, 1 /*index_base*/, sizeof(int) * 3, stack_hst + 0, stack_hst + 1,
-            stack_hst + 2, stack_size);
+          ACC_BENCH_GEMM_BATCH(LIBXSMM_DATATYPE(ELEM_TYPE), LIBXSMM_DATATYPE(ELEM_TYPE), &transa, &transb, m, n, k, &alpha,
+            amat_hst, &m /*lda*/, stack_hst + 0 /*stride_a*/, bmat_hst, &k /*ldb*/, stack_hst + 1 /*stride_b*/, &beta, gold_hst,
+            &m /*ldc*/, stack_hst + 2 /*stride_c*/, sizeof(int) * 3, 1 /*index_base*/, stack_size);
         }
         memset(gold_hst, 0, sizeof(ELEM_TYPE) * mn * nc);
         start = libxsmm_timer_tick();
         /* CPU-kernel operates on data that is not initialized in NUMA-aware fashion */
         for (r = 0; r < (nrepeat * smm_nrepeat); ++r) {
-          ACC_BENCH_USEOMP(libxsmm_gemm_batch)
-          (LIBXSMM_DATATYPE(ELEM_TYPE), LIBXSMM_DATATYPE(ELEM_TYPE), &transa, &transb, m, n, k, &alpha, amat_hst, &m /*lda*/,
-            bmat_hst, &k /*ldb*/, &beta, gold_hst, &m /*ldc*/, 1 /*index_base*/, sizeof(int) * 3, stack_hst + 0, stack_hst + 1,
-            stack_hst + 2, stack_size);
+          ACC_BENCH_GEMM_BATCH(LIBXSMM_DATATYPE(ELEM_TYPE), LIBXSMM_DATATYPE(ELEM_TYPE), &transa, &transb, m, n, k, &alpha,
+            amat_hst, &m /*lda*/, stack_hst + 0 /*stride_a*/, bmat_hst, &k /*ldb*/, stack_hst + 1 /*stride_b*/, &beta, gold_hst,
+            &m /*ldc*/, stack_hst + 2 /*stride_c*/, sizeof(int) * 3, 1 /*index_base*/, stack_size);
         }
         duration = libxsmm_timer_duration(start, libxsmm_timer_tick());
         PRINTF("host: %.2g ms %.1f GFLOPS/s\n", 1000.0 * duration / (nrepeat * smm_nrepeat),

diff --git a/src/acc/libsmm_acc/generate_kernels.py b/src/acc/libsmm_acc/generate_kernels.py
@@ -9,12 +9,9 @@
 # SPDX-License-Identifier: GPL-2.0+                                                                #
 ####################################################################################################
 
-from __future__ import print_function
-
-import os
-from os import path
 import re
 import argparse
+from pathlib import Path
 
 # ===============================================================================
 # Helper variables
@@ -29,66 +26,63 @@
 commented_line = r"\s*(//|/\*.*/*/)"
 open_comment = r"\s*/\*"
 close_comment = r".*\*/"
-smm_acc_header = (
-    "/*------------------------------------------------------------------------------------------------*\n"
-    + " * Copyright (C) by the DBCSR developers group - All rights reserved                              *\n"
-    + " * This file is part of the DBCSR library.                                                        *\n"
-    + " *                                                                                                *\n"
-    + " * For information on the license, see the LICENSE file.                                          *\n"
-    + " * For further information please visit https://dbcsr.cp2k.org                                    *\n"
-    + " * SPDX-License-Identifier: GPL-2.0+                                                              *\n"
-    + " *------------------------------------------------------------------------------------------------*/\n"
-    + "\n"
-    + "/*****************************************************************************\n"
-    + " *  FILE GENERATED BY SCRIPT 'generate_kernels.py' DO NOT EDIT             *\n"
-    + " *****************************************************************************/\n"
-    + "\n"
-    + "#ifndef SMM_ACC_H\n"
-    + "#define SMM_ACC_H\n"
-    + "#include <string>\n"
-)
+smm_acc_header = """\
+/*------------------------------------------------------------------------------------------------*
+ * Copyright (C) by the DBCSR developers group - All rights reserved                              *
+ * This file is part of the DBCSR library.                                                        *
+ *                                                                                                *
+ * For information on the license, see the LICENSE file.                                          *
+ * For further information please visit https://dbcsr.cp2k.org                                    *
+ * SPDX-License-Identifier: GPL-2.0+                                                              *
+ *------------------------------------------------------------------------------------------------*/
+
+/*****************************************************************************
+ *  FILE GENERATED BY SCRIPT 'generate_kernels.py' DO NOT EDIT               *
+ *****************************************************************************/
+
+#ifndef SMM_ACC_H
+#define SMM_ACC_H
+#include <string>
+"""
 
 
 # ===============================================================================
-def main(kernels_folder):
+def main(kernels_folder: Path):
     """
     Find files corresponding to CUDA/HIP kernels and write them as strings into a
     C++ header file to be read for JIT-ing
     """
     # Find all files containing "smm_acc" kernels in the "kernel" subfolder
-    kernels_folder_files = os.listdir(kernels_folder)
+    kernels_folder_files = kernels_folder.iterdir()
     kernel_files = list()
-    for f in kernels_folder_files:
-        if f[:8] == "smm_acc_" and f[-2:] == ".h":
-            kernel_files.append(os.path.join(kernels_folder, f))
-    print("Found {} kernel files:".format(len(kernel_files)))
-    print(*("<- {}".format(kf) for kf in kernel_files), sep="\n")
+    for kfile in kernels_folder_files:
+        if kfile.name.startswith("smm_acc_") and kfile.suffix == ".h":
+            kernel_files.append(kfile)
+    print(f"Found {len(kernel_files)} kernel files:")
+    print(*(f"<- {kf}" for kf in kernel_files), sep="\n")
 
     # Read
     kernels_h = (
         dict()
     )  # key: path to kernel file (string), value: file content (list of string)
     for kernel_file in kernel_files:
-        with open(kernel_file) as f:
-            kernels_h[kernel_file] = f.read().splitlines()
+        kernels_h[kernel_file] = kernel_file.read_text().splitlines()
 
     # Construct file containing the kernels as strings
     print("Re-write kernels as strings...")
     file_h = smm_acc_header
     for kernel_file, kernel in kernels_h.items():
-        kernel_name, _ = path.splitext(
-            path.basename(kernel_file)
-        )  # use the filename as name for the kernel
-        file_h += "\n" + separator + cpp_function_to_string(kernel, kernel_name) + "\n"
+        kernel_name = kernel_file.stem  # use the filename as name for the kernel
+        file_h += f"\n{separator}{cpp_function_to_string(kernel, kernel_name)}\n"
     file_h += "#endif  // SMM_ACC_H\n"
     file_h += "//EOF"
     file_h += "\n\n"
 
     # Write
     file_h_path = "smm_acc_kernels.h"
-    with open(file_h_path, "w") as f:
-        f.write(file_h)
-    print("Wrote kernel string to file\n-> {}".format(file_h_path))
+    with open(file_h_path, "w") as fhandle:
+        fhandle.write(file_h)
+    print(f"Wrote kernel string to file\n-> {file_h_path}")
 
 
 # ===============================================================================
@@ -104,7 +98,7 @@ def cpp_function_to_string(cpp_file, kernel_name):
         r"^[a-zA-Z]\w*", kernel_name
     ), "kernel_name must be a valid C/C++ variable name"
 
-    out = variable_declaration.format(var_name=kernel_name) + "\n"
+    out = f"{variable_declaration.format(var_name=kernel_name)}\n"
     in_comment = False
     for line in cpp_file:
         if not in_comment:
@@ -124,11 +118,8 @@ def cpp_function_to_string(cpp_file, kernel_name):
                     )
                     + "\n"
                 )
-        else:  # in_comment == True
-            if re.match(close_comment, line) is not None:
-                in_comment = False
-            else:
-                pass
+        elif re.match(close_comment, line):  # in_comment == True
+            in_comment = False
 
     return out + end_string
 
@@ -138,7 +129,7 @@ def cpp_function_to_string(cpp_file, kernel_name):
     parser.add_argument(
         "kernels_folder",
         metavar="KERNELS_FOLDER",
-        type=str,
+        type=Path,
         nargs="?",
         default="./kernels",
         help="directory with the kernel header files. Default: %(default)s",

diff --git a/src/acc/libsmm_acc/generate_parameters.py b/src/acc/libsmm_acc/generate_parameters.py
@@ -9,49 +9,47 @@
 # SPDX-License-Identifier: GPL-2.0+                                                                #
 ####################################################################################################
 
-from __future__ import print_function
-
 import json
 import argparse
-from os import path
+from pathlib import Path
 
 from kernels.smm_acc import params_dict_to_kernel, gpu_architectures
 
 
 # ===============================================================================
-def main(gpu_version, base_dir):
+def main(gpu_version: str, base_dir: Path):
+    param_fn = base_dir / f"parameters_{gpu_version}.json"
+
     try:  # Read existing parameters
-        param_fn = path.join(base_dir, "parameters_{}.json".format(gpu_version))
-        with open(param_fn) as f:
-            print("GPU version: {}".format(gpu_version))
-            all_kernels = [params_dict_to_kernel(**params) for params in json.load(f)]
-        print(
-            "About to process {:,} kernels from file {}".format(
-                len(all_kernels), param_fn
-            )
-        )
+        with param_fn.open("r") as fhandle:
+            print(f"GPU version: {gpu_version}")
+            all_kernels = [
+                params_dict_to_kernel(**params) for params in json.load(fhandle)
+            ]
+        print(f"About to process {len(all_kernels):,} kernels from file {param_fn}")
     except:  # noqa: E722
         all_kernels = []
         pass
 
     try:  # Read GPU properties (warp size)
-        gpu_props_fn = path.join(base_dir, "../kernels/gpu_properties.json")
-        arch_code = gpu_architectures[path.basename(param_fn)]
-        with open(gpu_props_fn) as f:
-            gpu_warp_size = json.load(f)[arch_code]["Threads_/_Warp"]
+        gpu_props_fn = base_dir / "../kernels/gpu_properties.json"
+        arch_code = gpu_architectures[param_fn.name]
+        with gpu_props_fn.open("r") as fhandle:
+            gpu_warp_size = json.load(fhandle)[arch_code]["Threads_/_Warp"]
     except:  # noqa: E722
         gpu_warp_size = 32
         pass
-    print("GPU warp size: {}".format(gpu_warp_size))
+
+    print(f"GPU warp size: {gpu_warp_size}")
 
     # Construct output
-    out, all_pars = write_parameters_file(all_kernels, gpu_warp_size)
+    out = write_parameters_file(all_kernels, gpu_warp_size)
 
     # Write to c++ header-file
     file_h = "parameters.h"
     if all_kernels:
-        print("Found {:,} kernels in file {}".format(len(all_kernels), param_fn))
-    print("Printing them to file {}".format(file_h))
+        print(f"Found {len(all_kernels):,} kernels in file {param_fn}")
+    print(f"Printing them to file {file_h}")
     with open(file_h, "w") as f:
         f.write(out)
 
@@ -103,7 +101,7 @@ def write_parameters_file(all_pars, gpu_warp_size):
 """
 
     # Warp size
-    out += "extern const int warp_size = {};\n\n".format(gpu_warp_size)
+    out += f"extern const int warp_size = {gpu_warp_size};\n\n"
 
     # Map of kernel parameters
     out += """\
@@ -125,10 +123,9 @@ def write_parameters_file(all_pars, gpu_warp_size):
 
 #endif
 //EOF
-\n\n
 """
 
-    return out, all_pars
+    return out
 
 
 # ===============================================================================
@@ -149,6 +146,7 @@ def write_parameters_file(all_pars, gpu_warp_size):
         "--base_dir",
         metavar="BASE_DIR",
         default="parameters/",
+        type=Path,
         help="Set the base directory to look for the parameter files. Default: %(default)s",
     )
     args = parser.parse_args()

diff --git a/src/acc/libsmm_acc/kernels/smm_acc.py b/src/acc/libsmm_acc/kernels/smm_acc.py
@@ -58,6 +58,7 @@
     "parameters_K80.json": "sm_37",
     "parameters_P100.json": "sm_60",
     "parameters_V100.json": "sm_70",
+    "parameters_A100.json": "sm_80",
     "parameters_Vega10.json": "gfx900",
     "parameters_Mi50.json": "gfx906",
     "parameters_Mi100.json": "gfx908",
@@ -82,7 +83,7 @@ def compatible_mnk(algo, m, n, k):
             compatible = False
     else:
         if algo != "medium":
-            assert False, "Cannot identify algorithm:" + str(algo)
+            assert False, f"Cannot identify algorithm:{str(algo)}"
 
     return compatible
 
@@ -122,15 +123,18 @@ def descr_to_kernel(kernel_descr, source="autotuned"):
         r"Kernel_dnt_(\w+)(\(.*\)) , # (\d+(?:\.\d+)?) GFlop/s"
     )
     kernel_descr_matched = re_kernel_descr.search(kernel_descr)
-    assert kernel_descr_matched is not None, (
-        'Could not match kernel description in "' + kernel_descr + '"'
-    )
+    assert (
+        kernel_descr_matched is not None
+    ), f'Could not match kernel description in "{kernel_descr}"'
     match = kernel_descr_matched.groups()
     algo = match[0]
-    m = match[1].replace("=", "':")
-    m = m.replace(", ", ", '")
-    m = m.replace("(", "{'")
-    m = m.replace(")", "}")
+    m = (
+        match[1]
+        .replace("=", "':")
+        .replace(", ", ", '")
+        .replace("(", "{'")
+        .replace(")", "}")
+    )
     params = dict(literal_eval(m))
     params["perf"] = float(match[2])
     params["source"] = source