From bdd6710f2eb42594dcc4b885725b8e36362caebc Mon Sep 17 00:00:00 2001 From: Andrey Alekseenko Date: Tue, 11 May 2021 07:16:38 +0000 Subject: [PATCH] Add SYCL implementation of GPU F buffer operations --- src/gromacs/mdlib/CMakeLists.txt | 5 + src/gromacs/mdlib/gpuforcereduction.h | 2 +- src/gromacs/mdlib/gpuforcereduction_impl.cpp | 2 +- .../gpuforcereduction_impl_internal_sycl.cpp | 141 ++++++++++++++++++ 4 files changed, 148 insertions(+), 2 deletions(-) create mode 100644 src/gromacs/mdlib/gpuforcereduction_impl_internal_sycl.cpp diff --git a/src/gromacs/mdlib/CMakeLists.txt b/src/gromacs/mdlib/CMakeLists.txt index 060f6160d7..511a946809 100644 --- a/src/gromacs/mdlib/CMakeLists.txt +++ b/src/gromacs/mdlib/CMakeLists.txt @@ -39,6 +39,7 @@ file(GLOB MDLIB_SOURCES *.cpp) # To avoid listing all the necessary files manually, we will remove SYCL-specific files here: list(REMOVE_ITEM MDLIB_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/gpuforcereduction_impl.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/gpuforcereduction_impl_internal_sycl.cpp ${CMAKE_CURRENT_SOURCE_DIR}/leapfrog_gpu_sycl.cpp ${CMAKE_CURRENT_SOURCE_DIR}/lincs_gpu.cpp ${CMAKE_CURRENT_SOURCE_DIR}/lincs_gpu_internal_sycl.cpp @@ -66,6 +67,8 @@ endif() if(GMX_GPU_SYCL) gmx_add_libgromacs_sources( + gpuforcereduction_impl.cpp + gpuforcereduction_impl_internal_sycl.cpp leapfrog_gpu_sycl.cpp lincs_gpu.cpp lincs_gpu_internal_sycl.cpp @@ -74,6 +77,8 @@ if(GMX_GPU_SYCL) ) _gmx_add_files_to_property(SYCL_SOURCES + gpuforcereduction_impl.cpp + gpuforcereduction_impl_internal_sycl.cpp leapfrog_gpu_sycl.cpp lincs_gpu.cpp lincs_gpu_internal_sycl.cpp diff --git a/src/gromacs/mdlib/gpuforcereduction.h b/src/gromacs/mdlib/gpuforcereduction.h index e015d7ef5e..922b8fecaf 100644 --- a/src/gromacs/mdlib/gpuforcereduction.h +++ b/src/gromacs/mdlib/gpuforcereduction.h @@ -60,7 +60,7 @@ class DeviceContext; namespace gmx { -#define HAVE_GPU_FORCE_REDUCTION (GMX_GPU_CUDA) +#define HAVE_GPU_FORCE_REDUCTION (GMX_GPU_CUDA || GMX_GPU_SYCL) /*! \internal * \brief Manages the force reduction directly in GPU memory diff --git a/src/gromacs/mdlib/gpuforcereduction_impl.cpp b/src/gromacs/mdlib/gpuforcereduction_impl.cpp index fb58c5c943..93772853d5 100644 --- a/src/gromacs/mdlib/gpuforcereduction_impl.cpp +++ b/src/gromacs/mdlib/gpuforcereduction_impl.cpp @@ -77,7 +77,7 @@ void GpuForceReduction::Impl::reinit(DeviceBuffer baseForcePtr, const bool accumulate, GpuEventSynchronizer* completionMarker) { - GMX_ASSERT((baseForcePtr != nullptr), "Input base force for reduction has no data"); + GMX_ASSERT(baseForcePtr, "Input base force for reduction has no data"); baseForce_ = baseForcePtr; numAtoms_ = numAtoms; atomStart_ = atomStart; diff --git a/src/gromacs/mdlib/gpuforcereduction_impl_internal_sycl.cpp b/src/gromacs/mdlib/gpuforcereduction_impl_internal_sycl.cpp new file mode 100644 index 0000000000..3bc21c43de --- /dev/null +++ b/src/gromacs/mdlib/gpuforcereduction_impl_internal_sycl.cpp @@ -0,0 +1,141 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2021, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * + * \brief Implements GPU Force Reduction using SYCL + * + * \author Alan Gray + * \author Andrey Alekseenko + * + * \ingroup module_mdlib + */ + +#include "gmxpre.h" + +#include "gpuforcereduction_impl_internal.h" + +#include + +#include "gromacs/gpu_utils/gmxsycl.h" +#include "gromacs/gpu_utils/devicebuffer.h" +#include "gromacs/gpu_utils/gpu_utils.h" +#include "gromacs/gpu_utils/gpueventsynchronizer_sycl.h" +#include "gromacs/utility/template_mp.h" + +namespace gmx +{ + +using cl::sycl::access::mode; + +template +static auto reduceKernel(cl::sycl::handler& cgh, + DeviceAccessor a_nbnxmForce, + OptionalAccessor a_rvecForceToAdd, + DeviceAccessor a_forceTotal, + DeviceAccessor a_cell, + const int atomStart) +{ + cgh.require(a_nbnxmForce); + if constexpr (addRvecForce) + { + cgh.require(a_rvecForceToAdd); + } + cgh.require(a_forceTotal); + cgh.require(a_cell); + + return [=](cl::sycl::id<1> itemIdx) { + // Set to nbnxnm force, then perhaps accumulate further to it + Float3 temp = a_nbnxmForce[a_cell[itemIdx]]; + + if constexpr (accumulateForce) + { + temp += a_forceTotal[itemIdx + atomStart]; + } + + if constexpr (addRvecForce) + { + temp += a_rvecForceToAdd[itemIdx + atomStart]; + } + + a_forceTotal[itemIdx + atomStart] = temp; + }; +} + +template +class ReduceKernelName; + +template +static void launchReductionKernel_(const int numAtoms, + const int atomStart, + const DeviceBuffer& b_nbnxmForce, + const DeviceBuffer& b_rvecForceToAdd, + DeviceBuffer& b_forceTotal, + const DeviceBuffer& b_cell, + const DeviceStream& deviceStream) +{ + const cl::sycl::range<1> rangeNumAtoms(numAtoms); + cl::sycl::queue queue = deviceStream.stream(); + + // We only need parts of b_rvecForceToAdd and b_forceTotal, so sub-buffers would be appropriate. + // But hipSYCL does not support them yet, nor plans to. See Issue #4019. + + queue.submit([&](cl::sycl::handler& cgh) { + auto kernel = reduceKernel( + cgh, b_nbnxmForce, b_rvecForceToAdd, b_forceTotal, b_cell, atomStart); + cgh.parallel_for>(rangeNumAtoms, kernel); + }); +} + +/*! \brief Select templated kernel and launch it. */ +void launchForceReductionKernel(int numAtoms, + int atomStart, + bool addRvecForce, + bool accumulate, + DeviceBuffer d_nbnxmForceToAdd, + DeviceBuffer d_rvecForceToAdd, + DeviceBuffer d_baseForce, + DeviceBuffer d_cell, + const DeviceStream& deviceStream) +{ + dispatchTemplatedFunction( + [&](auto addRvecForce_, auto accumulateForce_) { + return launchReductionKernel_( + numAtoms, atomStart, d_nbnxmForceToAdd, d_rvecForceToAdd, d_baseForce, d_cell, deviceStream); + }, + addRvecForce, + accumulate); +} + +} // namespace gmx -- 2.22.0