Add SYCL implementation of GPU X buffer operations
authorAndrey Alekseenko <al42and@gmail.com>
Mon, 10 May 2021 20:09:53 +0000 (20:09 +0000)
committerArtem Zhmurov <zhmurov@gmail.com>
Mon, 10 May 2021 20:09:53 +0000 (20:09 +0000)
src/gromacs/nbnxm/CMakeLists.txt
src/gromacs/nbnxm/nbnxm_gpu_buffer_ops.cpp
src/gromacs/nbnxm/sycl/nbnxm_gpu_buffer_ops_internal_sycl.cpp [new file with mode: 0644]

index 51aa880f470ee7c8cb6890e6212cac4424e7675b..8759c679bc9d5649ba85062896648f54a7e80968 100644 (file)
@@ -81,8 +81,8 @@ endif()
 
 if(GMX_GPU_SYCL)
     add_subdirectory(sycl)
-    gmx_add_libgromacs_sources(nbnxm_gpu_data_mgmt.cpp nbnxm_gpu_buffer_ops_stubs.cpp)
-    _gmx_add_files_to_property(SYCL_SOURCES nbnxm_gpu_data_mgmt.cpp nbnxm.cpp)
+    gmx_add_libgromacs_sources(nbnxm_gpu_data_mgmt.cpp nbnxm_gpu_buffer_ops.cpp)
+    _gmx_add_files_to_property(SYCL_SOURCES nbnxm_gpu_data_mgmt.cpp nbnxm_gpu_buffer_ops.cpp nbnxm.cpp)
 endif()
 
 set(LIBGROMACS_SOURCES ${LIBGROMACS_SOURCES} ${NBNXM_SOURCES} PARENT_SCOPE)
index 00a15a0c38412e0f629c84043256bd8cc0699932..d24babbfda6a7b6eb20ccf87fc9333aa427ee15f 100644 (file)
@@ -46,6 +46,8 @@
 #include "gromacs/gpu_utils/device_stream.h"
 #if GMX_GPU_CUDA
 #    include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
+#elif GMX_GPU_SYCL
+#    include "gromacs/gpu_utils/gpueventsynchronizer_sycl.h"
 #endif
 #include "gromacs/mdtypes/locality.h"
 #include "gromacs/nbnxm/gridset.h"
@@ -53,6 +55,8 @@
 #include "gromacs/nbnxm/nbnxm_gpu_buffer_ops_internal.h"
 #if GMX_GPU_CUDA
 #    include "gromacs/nbnxm/cuda/nbnxm_cuda_types.h"
+#elif GMX_GPU_SYCL
+#    include "gromacs/nbnxm/sycl/nbnxm_sycl_types.h"
 #endif
 #include "gromacs/utility/exceptions.h"
 
@@ -68,7 +72,8 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid&      grid,
                            int                     numColumnsMax,
                            bool                    mustInsertNonLocalDependency)
 {
-    GMX_RELEASE_ASSERT(GMX_GPU_CUDA, "nbnxn_gpu_x_to_nbat_x only supported with CUDA");
+    GMX_RELEASE_ASSERT(GMX_GPU_CUDA || GMX_GPU_SYCL,
+                       "nbnxn_gpu_x_to_nbat_x only supported with CUDA and SYCL");
     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
     gmx::InteractionLocality interactionLoc = gmx::atomToInteractionLocality(locality);
 
diff --git a/src/gromacs/nbnxm/sycl/nbnxm_gpu_buffer_ops_internal_sycl.cpp b/src/gromacs/nbnxm/sycl/nbnxm_gpu_buffer_ops_internal_sycl.cpp
new file mode 100644 (file)
index 0000000..d95bd6f
--- /dev/null
@@ -0,0 +1,135 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ *  \brief
+ *  SYCL implementation of coordinate layout conversion kernel and its wrapper
+ *
+ *  \ingroup module_nbnxm
+ */
+#include "gmxpre.h"
+
+#include "gromacs/gpu_utils/devicebuffer.h"
+#include "gromacs/gpu_utils/gmxsycl.h"
+#include "gromacs/nbnxm/grid.h"
+#include "gromacs/nbnxm/nbnxm_gpu_buffer_ops_internal.h"
+
+#include "nbnxm_sycl_types.h"
+
+using cl::sycl::access::mode;
+using cl::sycl::access::target;
+
+namespace Nbnxm
+{
+
+/*! \brief SYCL kernel for transforming position coordinates from rvec to nbnxm layout.
+ *
+ * \param         cgh                 SYCL's command group handler.
+ * \param[out]    a_xq                Coordinates buffer in nbnxm layout.
+ * \param[in]     a_x                 Coordinates buffer.
+ * \param[in]     a_atomIndex         Atom index mapping.
+ * \param[in]     a_numAtoms          Array of number of atoms.
+ * \param[in]     a_cellIndex         Array of cell indices.
+ * \param[in]     cellOffset          First cell.
+ * \param[in]     numAtomsPerCell     Number of atoms per cell.
+ * \param[in]     columnsOffset       Index if the first column in the cell.
+ */
+static auto nbnxmKernelTransformXToXq(cl::sycl::handler&                       cgh,
+                                      DeviceAccessor<Float4, mode::read_write> a_xq,
+                                      DeviceAccessor<Float3, mode::read>       a_x,
+                                      DeviceAccessor<int, mode::read>          a_atomIndex,
+                                      DeviceAccessor<int, mode::read>          a_numAtoms,
+                                      DeviceAccessor<int, mode::read>          a_cellIndex,
+                                      int                                      cellOffset,
+                                      int                                      numAtomsPerCell,
+                                      int                                      columnsOffset)
+{
+    cgh.require(a_xq);
+    cgh.require(a_x);
+    cgh.require(a_atomIndex);
+    cgh.require(a_numAtoms);
+    cgh.require(a_cellIndex);
+
+    return [=](cl::sycl::id<2> itemIdx) {
+        // Map cell-level parallelism to y component of block index.
+        const int cxy = itemIdx.get(1) + columnsOffset;
+
+        const int numAtoms = a_numAtoms[cxy];
+        const int offset   = (cellOffset + a_cellIndex[cxy]) * numAtomsPerCell;
+
+        const int threadIndex = itemIdx.get(0);
+
+        // Perform layout conversion of each element.
+        if (threadIndex < numAtoms)
+        {
+            const float  q             = a_xq[threadIndex + offset][3];
+            const Float3 xNew          = a_x[a_atomIndex[threadIndex + offset]];
+            a_xq[threadIndex + offset] = Float4(xNew[0], xNew[1], xNew[2], q);
+        }
+    };
+}
+
+// SYCL 1.2.1 requires providing a unique type for a kernel. Should not be needed for SYCL2020.
+class NbnxmKernelTransformXToXqName;
+
+void launchNbnxmKernelTransformXToXq(const Grid&          grid,
+                                     NbnxmGpu*            nb,
+                                     DeviceBuffer<Float3> d_x,
+                                     const DeviceStream&  deviceStream,
+                                     unsigned int         numColumnsMax,
+                                     int                  gridId)
+{
+    const unsigned int numColumns  = grid.numColumns();
+    const unsigned int numAtomsMax = grid.numCellsColumnMax() * grid.numAtomsPerCell();
+    GMX_ASSERT(numColumns <= numColumnsMax, "Grid has more columns than allowed");
+
+    const cl::sycl::range<2> globalSize{ numAtomsMax, numColumns };
+    cl::sycl::queue          q = deviceStream.stream();
+
+    q.submit([&](cl::sycl::handler& cgh) {
+        auto kernel = nbnxmKernelTransformXToXq(cgh,
+                                                nb->atdat->xq,
+                                                d_x,
+                                                nb->atomIndices,
+                                                nb->cxy_na,
+                                                nb->cxy_ind,
+                                                grid.cellOffset(),
+                                                grid.numAtomsPerCell(),
+                                                numColumnsMax * gridId);
+        cgh.parallel_for<NbnxmKernelTransformXToXqName>(globalSize, kernel);
+    });
+}
+
+} // namespace Nbnxm