From: Andrey Alekseenko Date: Thu, 18 Feb 2021 11:51:28 +0000 (+0000) Subject: SYCL NBNXM offload support X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=b8fb87a0c5d65014d97e618241a0075875a81199;p=alexxy%2Fgromacs.git SYCL NBNXM offload support Associated changes: - Added function stubs to PME: necessary for compilation. - Stricter SYCL hardware compatibility checks: limits on subgroup size and the availability of local memory. - The kernel implementation and overall logic closely follow the OpenCL implementation. Divergences are documented locally. Limitations: - No fine-grained timings yet. - Code-duplication with CUDA and OpenCL: see #2608. - Minor differences in local/nonlocal synchronization: see #3895, related to #2608. - Only the OpenCL backend was extensively tested. LevelZero works fine without MPI but stalls due to a known bug. The fix for DPCPP runtime is available, but not yet part of any OneAPI release: https://github.com/intel/llvm/pull/3045. - The complex/position-restraints regression test fails: see #3846. - No performance tuning: see #3847. Performance on rnase-cubic system is similar to OpenCL implementation. --- diff --git a/src/gromacs/ewald/CMakeLists.txt b/src/gromacs/ewald/CMakeLists.txt index 5699d52651..ac50cc1a48 100644 --- a/src/gromacs/ewald/CMakeLists.txt +++ b/src/gromacs/ewald/CMakeLists.txt @@ -2,7 +2,7 @@ # This file is part of the GROMACS molecular simulation package. # # Copyright (c) 2014,2015,2016,2017,2018 by the GROMACS development team. -# Copyright (c) 2019,2020, by the GROMACS development team, led by +# Copyright (c) 2019,2020,2021, by the GROMACS development team, led by # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, # and including many others, as listed in the AUTHORS file in the # top-level source directory and at http://www.gromacs.org. @@ -76,7 +76,7 @@ if (GMX_GPU_CUDA) # Must add these files so they can include device_information.h pme_gpu_internal.cpp pme_gpu_timings.cpp - ) + ) elseif (GMX_GPU_OPENCL) gmx_add_libgromacs_sources( # OpenCL-specific sources @@ -90,11 +90,18 @@ elseif (GMX_GPU_OPENCL) elseif (GMX_GPU_SYCL) # SYCL-TODO: proper implementation gmx_add_libgromacs_sources( - pme_gpu_program_impl.cpp + # Files that implement stubs + pme_gpu_sycl_stubs.cpp + # GPU-specific sources + pme_gpu.cpp + pme_gpu_internal.cpp + pme_gpu_timings.cpp ) _gmx_add_files_to_property(SYCL_SOURCES - pme_gpu_program_impl.cpp + pme_gpu_internal.cpp pme_gpu_program.cpp + pme_gpu_sycl_stubs.cpp + pme_gpu_timings.cpp ) else() gmx_add_libgromacs_sources( diff --git a/src/gromacs/ewald/pme_gpu_3dfft.h b/src/gromacs/ewald/pme_gpu_3dfft.h index a39f751bab..d71e43522c 100644 --- a/src/gromacs/ewald/pme_gpu_3dfft.h +++ b/src/gromacs/ewald/pme_gpu_3dfft.h @@ -1,7 +1,8 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by + * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team. + * Copyright (c) 2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -56,6 +57,8 @@ # include "gromacs/gpu_utils/gmxopencl.h" # include "gromacs/gpu_utils/gputraits_ocl.h" +#elif GMX_GPU_SYCL +# include "gromacs/gpu_utils/gputraits_sycl.h" #endif #include "gromacs/fft/fft.h" // for the enum gmx_fft_direction diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp index e78373ec8b..0d7ee39e23 100644 --- a/src/gromacs/ewald/pme_gpu_internal.cpp +++ b/src/gromacs/ewald/pme_gpu_internal.cpp @@ -1,7 +1,8 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by + * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team. + * Copyright (c) 2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -75,6 +76,8 @@ # include "pme.cuh" #elif GMX_GPU_OPENCL # include "gromacs/gpu_utils/gmxopencl.h" +#elif GMX_GPU_SYCL +# include "gromacs/gpu_utils/syclutils.h" #endif #include "gromacs/ewald/pme.h" @@ -468,7 +471,7 @@ void pme_gpu_free_fract_shifts(const PmeGpu* pmeGpu) kernelParamsPtr->fractShiftsTableTexture); destroyParamLookupTable(&kernelParamsPtr->grid.d_gridlineIndicesTable, kernelParamsPtr->gridlineIndicesTableTexture); -#elif GMX_GPU_OPENCL +#elif GMX_GPU_OPENCL || GMX_GPU_SYCL freeDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable); freeDeviceBuffer(&kernelParamsPtr->grid.d_gridlineIndicesTable); #endif diff --git a/src/gromacs/ewald/pme_gpu_sycl_stubs.cpp b/src/gromacs/ewald/pme_gpu_sycl_stubs.cpp new file mode 100644 index 0000000000..c6f6c1c303 --- /dev/null +++ b/src/gromacs/ewald/pme_gpu_sycl_stubs.cpp @@ -0,0 +1,79 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2020,2021, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/*! \internal \file + * \brief Implements stubs of high-level PME GPU functions for SYCL. + * + * \author Andrey Alekseenko + * + * \ingroup module_ewald + */ +#include "gmxpre.h" + +#include "gromacs/ewald/ewald_utils.h" + +#include "pme_gpu_3dfft.h" +#include "pme_gpu_internal.h" +#include "pme_gpu_program_impl.h" + +PmeGpuProgramImpl::PmeGpuProgramImpl(const DeviceContext& deviceContext) : + deviceContext_(deviceContext), + warpSize_(0), + spreadWorkGroupSize(0), + gatherWorkGroupSize(0), + solveMaxWorkGroupSize(0) +{ + // SYCL-TODO +} + +PmeGpuProgramImpl::~PmeGpuProgramImpl() = default; + +// [[noreturn]] attributes must be added in the common headers, so it's easier to silence the warning here +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wmissing-noreturn" + +GpuParallel3dFft::GpuParallel3dFft(PmeGpu const* /*pmeGpu*/, int /*gridIndex*/) +{ + GMX_THROW(gmx::NotImplementedError("PME is not implemented in SYCL")); +} + +GpuParallel3dFft::~GpuParallel3dFft() = default; + +void GpuParallel3dFft::perform3dFft(gmx_fft_direction /*dir*/, CommandEvent* /*timingEvent*/) +{ + GMX_THROW(gmx::NotImplementedError("Not implemented on SYCL yet")); +} + +#pragma clang diagnostic pop diff --git a/src/gromacs/ewald/pme_gpu_types_host.h b/src/gromacs/ewald/pme_gpu_types_host.h index 26405433f5..055e61e5d8 100644 --- a/src/gromacs/ewald/pme_gpu_types_host.h +++ b/src/gromacs/ewald/pme_gpu_types_host.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by + * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -78,7 +78,7 @@ typedef int PmeGpuSpecific; struct PmeGpuCudaKernelParams; /*! \brief A typedef for including the GPU kernel arguments data by pointer */ typedef PmeGpuCudaKernelParams PmeGpuKernelParams; -#elif GMX_GPU_OPENCL +#elif GMX_GPU_OPENCL || GMX_GPU_SYCL struct PmeGpuKernelParamsBase; /*! \brief A typedef for including the GPU kernel arguments data by pointer */ typedef PmeGpuKernelParamsBase PmeGpuKernelParams; diff --git a/src/gromacs/ewald/pme_gpu_types_host_impl.h b/src/gromacs/ewald/pme_gpu_types_host_impl.h index cf98d701c5..976e064cb2 100644 --- a/src/gromacs/ewald/pme_gpu_types_host_impl.h +++ b/src/gromacs/ewald/pme_gpu_types_host_impl.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by + * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -56,6 +56,9 @@ #elif GMX_GPU_OPENCL # include "gromacs/gpu_utils/gpueventsynchronizer_ocl.h" # include "gromacs/gpu_utils/gpuregiontimer_ocl.h" +#elif GMX_GPU_SYCL +# include "gromacs/gpu_utils/gpueventsynchronizer_sycl.h" +# include "gromacs/gpu_utils/gpuregiontimer_sycl.h" #endif #include "gromacs/timing/gpu_timing.h" // for gtPME_EVENT_COUNT diff --git a/src/gromacs/gpu_utils/CMakeLists.txt b/src/gromacs/gpu_utils/CMakeLists.txt index 7958f18c13..3b59b9279e 100644 --- a/src/gromacs/gpu_utils/CMakeLists.txt +++ b/src/gromacs/gpu_utils/CMakeLists.txt @@ -1,7 +1,8 @@ # # This file is part of the GROMACS molecular simulation package. # -# Copyright (c) 2015,2016,2017,2018,2019,2020, by the GROMACS development team, led by +# Copyright (c) 2015,2016,2017,2018,2019, by the GROMACS development team. +# Copyright (c) 2020,2021, by the GROMACS development team, led by # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, # and including many others, as listed in the AUTHORS file in the # top-level source directory and at http://www.gromacs.org. @@ -62,12 +63,13 @@ elseif(GMX_GPU_CUDA) ) _gmx_add_files_to_property(CUDA_SOURCES device_stream_manager.cpp - ) + ) elseif(GMX_GPU_SYCL) gmx_add_libgromacs_sources( devicebuffer_sycl.cpp device_context_sycl.cpp device_stream_sycl.cpp + syclutils.cpp ) _gmx_add_files_to_property(SYCL_SOURCES devicebuffer_sycl.cpp @@ -75,6 +77,7 @@ elseif(GMX_GPU_SYCL) device_context_sycl.cpp device_stream_manager.cpp device_stream_sycl.cpp + syclutils.cpp ) else() gmx_add_libgromacs_sources( diff --git a/src/gromacs/gpu_utils/gpu_macros.h b/src/gromacs/gpu_utils/gpu_macros.h index 1bfbefd0a7..bf57d57d3b 100644 --- a/src/gromacs/gpu_utils/gpu_macros.h +++ b/src/gromacs/gpu_utils/gpu_macros.h @@ -1,7 +1,8 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2014,2015,2017,2018,2019,2020, by the GROMACS development team, led by + * Copyright (c) 2014,2015,2017,2018,2019, by the GROMACS development team. + * Copyright (c) 2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -73,11 +74,15 @@ # define OPENCL_FUNC_ARGUMENT REAL_FUNC_ARGUMENT # define OPENCL_FUNC_TERM REAL_FUNC_TERM # define OPENCL_FUNC_TERM_WITH_RETURN(arg) REAL_FUNC_TERM_WITH_RETURN(arg) +# define SYCL_FUNC_QUALIFIER REAL_FUNC_QUALIFIER +# define SYCL_FUNC_ARGUMENT REAL_FUNC_ARGUMENT +# define SYCL_FUNC_TERM REAL_FUNC_TERM +# define SYCL_FUNC_TERM_WITH_RETURN(arg) REAL_FUNC_TERM_WITH_RETURN(arg) #else // Not DOXYGEN /* GPU support is enabled, so these functions will have real code defined somewhere */ -# if GMX_GPU && !GMX_GPU_SYCL +# if GMX_GPU # define GPU_FUNC_QUALIFIER REAL_FUNC_QUALIFIER # define GPU_FUNC_ARGUMENT REAL_FUNC_ARGUMENT # define GPU_FUNC_TERM REAL_FUNC_TERM @@ -87,7 +92,6 @@ # define GPU_FUNC_ARGUMENT NULL_FUNC_ARGUMENT # define GPU_FUNC_TERM NULL_FUNC_TERM # define GPU_FUNC_TERM_WITH_RETURN(arg) NULL_FUNC_TERM_WITH_RETURN(arg) -# # endif /* Enable and disable platform-specific function implementations */ @@ -115,6 +119,18 @@ # define CUDA_FUNC_TERM_WITH_RETURN(arg) NULL_FUNC_TERM_WITH_RETURN(arg) # endif +# if GMX_GPU_SYCL +# define SYCL_FUNC_QUALIFIER REAL_FUNC_QUALIFIER +# define SYCL_FUNC_ARGUMENT REAL_FUNC_ARGUMENT +# define SYCL_FUNC_TERM REAL_FUNC_TERM +# define SYCL_FUNC_TERM_WITH_RETURN(arg) REAL_FUNC_TERM_WITH_RETURN(arg) +# else +# define SYCL_FUNC_QUALIFIER NULL_FUNC_QUALIFIER +# define SYCL_FUNC_ARGUMENT NULL_FUNC_ARGUMENT +# define SYCL_FUNC_TERM NULL_FUNC_TERM +# define SYCL_FUNC_TERM_WITH_RETURN(arg) NULL_FUNC_TERM_WITH_RETURN(arg) +# endif + #endif // ifdef DOXYGEN #endif // GMX_GPU_UTILS_MACROS_H diff --git a/src/gromacs/gpu_utils/gpu_utils.cpp b/src/gromacs/gpu_utils/gpu_utils.cpp index eb26ce0a8e..663137084f 100644 --- a/src/gromacs/gpu_utils/gpu_utils.cpp +++ b/src/gromacs/gpu_utils/gpu_utils.cpp @@ -1,7 +1,8 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2014,2015,2017,2018,2019,2020, by the GROMACS development team, led by + * Copyright (c) 2014,2015,2017,2018,2019, by the GROMACS development team. + * Copyright (c) 2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -43,10 +44,7 @@ #include "config.h" -#include - #include "gromacs/utility/arrayref.h" -#include "gromacs/utility/smalloc.h" #include "gromacs/utility/stringutil.h" #ifdef _MSC_VER @@ -79,9 +77,5 @@ bool buildSupportsNonbondedOnGpu(std::string* error) { errorReasons.emplace_back("non-GPU build of GROMACS"); } - if (GMX_GPU_SYCL) - { - errorReasons.emplace_back("SYCL build of GROMACS"); - } return addMessageIfNotSupported(errorReasons, error); } diff --git a/src/gromacs/gpu_utils/gputraits_sycl.h b/src/gromacs/gpu_utils/gputraits_sycl.h index 52e40bb836..d53552e3c2 100644 --- a/src/gromacs/gpu_utils/gputraits_sycl.h +++ b/src/gromacs/gpu_utils/gputraits_sycl.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by + * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -47,6 +47,7 @@ #include #include "gromacs/gpu_utils/gmxsycl.h" +#include "gromacs/math/vectypes.h" using DeviceTexture = void*; diff --git a/src/gromacs/gpu_utils/syclutils.cpp b/src/gromacs/gpu_utils/syclutils.cpp new file mode 100644 index 0000000000..0f660c04f5 --- /dev/null +++ b/src/gromacs/gpu_utils/syclutils.cpp @@ -0,0 +1,78 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2020,2021, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * \brief Define utility routines for SYCL + * + * \author Andrey Alekseenko + */ +#include "gmxpre.h" + +#include "syclutils.h" + +#include "gromacs/utility/smalloc.h" + +/*! \brief Allocates \p nbytes of host memory. Use \c pfree to free memory allocated with this function. + * + * \todo + * This function was copied from OpenCL implementation, not tuned for SYCL at all. + * Once SYCL2020 is out, might be worthwhile to look into USM and sycl::malloc_host / sycl::aligned_alloc_host. + * Overall, it is better to directly use sycl::buffer instead of pinned arrays. But this function + * is needed to compile some PME code with SYCL enabled, even if it is never used. + * + * \param[in,out] h_ptr Pointer where to store the address of the newly allocated buffer. + * \param[in] nbytes Size in bytes of the buffer to be allocated. + */ +void pmalloc(void** h_ptr, size_t nbytes) +{ + /* Need a temporary type whose size is 1 byte, so that the + * implementation of snew_aligned can cope without issuing + * warnings. */ + auto** temporary = reinterpret_cast(h_ptr); + + /* 16-byte alignment inherited from OpenCL and does not sound unreasonable */ + snew_aligned(*temporary, nbytes, 16); +} + +/*! \brief Frees memory allocated with pmalloc. + * + * \param[in] h_ptr Buffer allocated with pmalloc that needs to be freed. + */ +void pfree(void* h_ptr) +{ + if (h_ptr) + { + sfree_aligned(h_ptr); + } +} diff --git a/src/gromacs/gpu_utils/syclutils.h b/src/gromacs/gpu_utils/syclutils.h new file mode 100644 index 0000000000..1318a2a9b4 --- /dev/null +++ b/src/gromacs/gpu_utils/syclutils.h @@ -0,0 +1,129 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2020,2021, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \libinternal \file + * \brief Declare utility routines for SYCL + * + * \author Andrey Alekseenko + * \inlibraryapi + */ +#ifndef GMX_GPU_UTILS_SYCLUTILS_H +#define GMX_GPU_UTILS_SYCLUTILS_H + +#include + +#include "gromacs/gpu_utils/gmxsycl.h" +#include "gromacs/gpu_utils/gputraits.h" +#include "gromacs/utility/exceptions.h" +#include "gromacs/utility/gmxassert.h" +#include "gromacs/utility/stringutil.h" + +class DeviceStream; +enum class GpuApiCallBehavior; + +/*! \internal + * \brief SYCL GPU runtime data + * + * The device runtime data is meant to hold objects associated with a GROMACS rank's + * (thread or process) use of a single device (multiple devices per rank is not + * implemented). These objects should be constructed at the point where a device + * gets assigned to a rank and released at when this assignment is no longer valid + * (i.e. at cleanup in the current implementation). + */ +struct gmx_device_runtime_data_t +{ +}; + +#ifndef DOXYGEN + +/*! \brief Allocate host memory in malloc style */ +void pmalloc(void** h_ptr, size_t nbytes); + +/*! \brief Free host memory in malloc style */ +void pfree(void* h_ptr); + +/* To properly mark function as [[noreturn]], we must do it everywhere it is declared, which + * will pollute common headers.*/ +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wmissing-noreturn" + +/*! \brief + * A wrapper function for setting up all the SYCL kernel arguments. + * Calls the recursive functions above. + * + * \tparam Args Types of all the kernel arguments + * \param[in] kernel Kernel function handle + * \param[in] config Kernel configuration for launching + * \param[in] argsPtrs Pointers to all the kernel arguments + * \returns A handle for the prepared parameter pack to be used with launchGpuKernel() as the last argument. + */ +template +void* prepareGpuKernelArguments(void* /*kernel*/, const KernelLaunchConfig& /*config*/, const Args*... /*argsPtrs*/) +{ + GMX_THROW(gmx::NotImplementedError("Not implemented on SYCL yet")); +} + +/*! \brief Launches the SYCL kernel and handles the errors. + * + * \param[in] kernel Kernel function handle + * \param[in] config Kernel configuration for launching + * \param[in] deviceStream GPU stream to launch kernel in + * \param[in] timingEvent Timing event, fetched from GpuRegionTimer + * \param[in] kernelName Human readable kernel description, for error handling only + * \throws gmx::InternalError on kernel launch failure + */ +inline void launchGpuKernel(void* /*kernel*/, + const KernelLaunchConfig& /*config*/, + const DeviceStream& /*deviceStream*/, + CommandEvent* /*timingEvent*/, + const char* /*kernelName*/, + const void* /*kernelArgs*/) +{ + GMX_THROW(gmx::NotImplementedError("Not implemented on SYCL yet")); +} + +/*! \brief Pretend to check a SYCL stream for unfinished work (dummy implementation). + * + * \returns Not implemented in SYCL. + */ +static inline bool haveStreamTasksCompleted(const DeviceStream& /* deviceStream */) +{ + GMX_THROW(gmx::NotImplementedError("Not implemented on SYCL yet")); +} + +# pragma clang diagnostic pop + +#endif // !DOXYGEN + +#endif diff --git a/src/gromacs/hardware/device_management_sycl.cpp b/src/gromacs/hardware/device_management_sycl.cpp index 93021e418f..4fd401e04b 100644 --- a/src/gromacs/hardware/device_management_sycl.cpp +++ b/src/gromacs/hardware/device_management_sycl.cpp @@ -92,14 +92,43 @@ static DeviceStatus isDeviceCompatible(const cl::sycl::device& syclDevice) return DeviceStatus::Compatible; } - if (syclDevice.is_accelerator()) // FPGAs and FPGA emulators + if (syclDevice.get_info() == cl::sycl::info::local_mem_type::none) { + // While some kernels (leapfrog) can run without shared/local memory, this is a bad sign return DeviceStatus::Incompatible; } - else + + if (syclDevice.is_host()) + { + // Host device does not support subgroups or even querying for sub_group_sizes + return DeviceStatus::Incompatible; + } + const std::vector supportedSubGroupSizes = + syclDevice.get_info(); + const size_t requiredSubGroupSizeForNBNXM = 8; + if (std::find(supportedSubGroupSizes.begin(), supportedSubGroupSizes.end(), requiredSubGroupSizeForNBNXM) + == supportedSubGroupSizes.end()) + { + return DeviceStatus::IncompatibleClusterSize; + } + + /* Host device can not be used, because NBNXM requires sub-groups, which are not supported. + * Accelerators (FPGAs and their emulators) are not supported. + * So, the only viable options are CPUs and GPUs. */ + const bool forceCpu = (getenv("GMX_SYCL_FORCE_CPU") != nullptr); + + if (forceCpu && syclDevice.is_cpu()) + { + return DeviceStatus::Compatible; + } + else if (!forceCpu && syclDevice.is_gpu()) { return DeviceStatus::Compatible; } + else + { + return DeviceStatus::Incompatible; + } } diff --git a/src/gromacs/listed_forces/gpubonded_impl.cpp b/src/gromacs/listed_forces/gpubonded_impl.cpp index ff62325723..ebe0f95085 100644 --- a/src/gromacs/listed_forces/gpubonded_impl.cpp +++ b/src/gromacs/listed_forces/gpubonded_impl.cpp @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by + * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -124,6 +124,10 @@ bool buildSupportsGpuBondeds(std::string* error) { errorReasons.emplace_back("not supported with OpenCL build of GROMACS"); } + if (GMX_GPU_SYCL) + { + errorReasons.emplace_back("not supported with SYCL build of GROMACS"); + } else if (!GMX_GPU) { errorReasons.emplace_back("not supported with CPU-only build of GROMACS"); diff --git a/src/gromacs/nbnxm/CMakeLists.txt b/src/gromacs/nbnxm/CMakeLists.txt index bca3f88df9..91738e6235 100644 --- a/src/gromacs/nbnxm/CMakeLists.txt +++ b/src/gromacs/nbnxm/CMakeLists.txt @@ -1,7 +1,7 @@ # # This file is part of the GROMACS molecular simulation package. # -# Copyright (c) 2019,2020, by the GROMACS development team, led by +# Copyright (c) 2019,2020,2021, by the GROMACS development team, led by # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, # and including many others, as listed in the AUTHORS file in the # top-level source directory and at http://www.gromacs.org. @@ -69,7 +69,7 @@ if(GMX_GPU_CUDA) gmx_add_libgromacs_sources(nbnxm_gpu_data_mgmt.cpp) _gmx_add_files_to_property(CUDA_SOURCES nbnxm_gpu_data_mgmt.cpp - ) + ) endif() if(GMX_GPU_OPENCL) @@ -78,6 +78,12 @@ if(GMX_GPU_OPENCL) gmx_add_libgromacs_sources(nbnxm_gpu_data_mgmt.cpp) endif() +if(GMX_GPU_SYCL) + add_subdirectory(sycl) + gmx_add_libgromacs_sources(nbnxm_gpu_data_mgmt.cpp) + _gmx_add_files_to_property(SYCL_SOURCES nbnxm_gpu_data_mgmt.cpp nbnxm.cpp) +endif() + set(LIBGROMACS_SOURCES ${LIBGROMACS_SOURCES} ${NBNXM_SOURCES} PARENT_SCOPE) @@ -103,4 +109,4 @@ target_link_libraries(nbnxm INTERFACE #target_link_libraries(nbnxm PUBLIC target_link_libraries(nbnxm INTERFACE utility - ) \ No newline at end of file + ) diff --git a/src/gromacs/nbnxm/gpu_common.h b/src/gromacs/nbnxm/gpu_common.h index 9b383d2ec6..bb1eb874ae 100644 --- a/src/gromacs/nbnxm/gpu_common.h +++ b/src/gromacs/nbnxm/gpu_common.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by + * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -55,6 +55,10 @@ # include "opencl/nbnxm_ocl_types.h" #endif +#if GMX_GPU_SYCL +# include "sycl/nbnxm_sycl_types.h" +#endif + #include "gromacs/gpu_utils/gpu_utils.h" #include "gromacs/listed_forces/gpubonded.h" #include "gromacs/math/vec.h" @@ -246,8 +250,7 @@ static void countPruneKernelTime(GpuTimers* timers, * \param[out] e_el Variable to accumulate electrostatic energy into * \param[out] fshift Pointer to the array of shift forces to accumulate into */ -template -static inline void gpu_reduce_staged_outputs(const StagingData& nbst, +static inline void gpu_reduce_staged_outputs(const nb_staging_t& nbst, const InteractionLocality iLocality, const bool reduceEnergies, const bool reduceFshift, @@ -435,9 +438,12 @@ bool gpu_try_finish_task(NbnxmGpu* nb, } } - /* Always reset both pruning flags (doesn't hurt doing it even when timing is off). */ - nb->timers->interaction[iLocality].didPrune = - nb->timers->interaction[iLocality].didRollingPrune = false; + /* Reset both pruning flags. */ + if (nb->bDoTime) + { + nb->timers->interaction[iLocality].didPrune = + nb->timers->interaction[iLocality].didRollingPrune = false; + } /* Turn off initial list pruning (doesn't hurt if this is not pair-search step). */ nb->plist[iLocality]->haveFreshList = false; diff --git a/src/gromacs/nbnxm/gpu_types_common.h b/src/gromacs/nbnxm/gpu_types_common.h index 2d2e90791e..26cd3c165f 100644 --- a/src/gromacs/nbnxm/gpu_types_common.h +++ b/src/gromacs/nbnxm/gpu_types_common.h @@ -44,6 +44,7 @@ #include "config.h" +#include "gromacs/mdtypes/interaction_const.h" #include "gromacs/mdtypes/locality.h" #include "gromacs/utility/enumerationhelpers.h" @@ -58,6 +59,10 @@ # include "gromacs/gpu_utils/gpuregiontimer.cuh" #endif +#if GMX_GPU_SYCL +# include "gromacs/gpu_utils/gpuregiontimer_sycl.h" +#endif + /** \internal * \brief Parameters required for the GPU nonbonded calculations. */ @@ -77,7 +82,7 @@ struct NBParamGpu float two_k_rf; //! Ewald/PME parameter float ewald_beta; - //! Ewald/PME correction term substracted from the direct-space potential + //! Ewald/PME correction term subtracted from the direct-space potential float sh_ewald; //! LJ-Ewald/PME correction term added to the correction potential float sh_lj_ewald; diff --git a/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp b/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp index 9fe755c586..f811eee3e5 100644 --- a/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp +++ b/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp @@ -56,6 +56,10 @@ # include "opencl/nbnxm_ocl_types.h" #endif +#if GMX_GPU_SYCL +# include "sycl/nbnxm_sycl_types.h" +#endif + #include "nbnxm_gpu_data_mgmt.h" #include "gromacs/hardware/device_information.h" @@ -63,6 +67,7 @@ #include "gromacs/nbnxm/gpu_data_mgmt.h" #include "gromacs/timing/gpu_timing.h" #include "gromacs/utility/cstringutil.h" +#include "gromacs/utility/fatalerror.h" #include "nbnxm_gpu.h" #include "pairlistsets.h" diff --git a/src/gromacs/nbnxm/pairlistparams.h b/src/gromacs/nbnxm/pairlistparams.h index 2783ae454d..72bed6748b 100644 --- a/src/gromacs/nbnxm/pairlistparams.h +++ b/src/gromacs/nbnxm/pairlistparams.h @@ -62,6 +62,8 @@ static constexpr int c_nbnxnCpuIClusterSize = 4; //! The i- and j-cluster size for GPU lists, 8 atoms for CUDA, set at compile time for OpenCL #if GMX_GPU_OPENCL static constexpr int c_nbnxnGpuClusterSize = GMX_OPENCL_NB_CLUSTER_SIZE; +#elif GMX_GPU_SYCL +static constexpr int c_nbnxnGpuClusterSize = 4; #else static constexpr int c_nbnxnGpuClusterSize = 8; #endif diff --git a/src/gromacs/nbnxm/sycl/CMakeLists.txt b/src/gromacs/nbnxm/sycl/CMakeLists.txt new file mode 100644 index 0000000000..4a6cc29b19 --- /dev/null +++ b/src/gromacs/nbnxm/sycl/CMakeLists.txt @@ -0,0 +1,39 @@ +# +# This file is part of the GROMACS molecular simulation package. +# +# Copyright (c) 2020,2021, by the GROMACS development team, led by +# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, +# and including many others, as listed in the AUTHORS file in the +# top-level source directory and at http://www.gromacs.org. +# +# GROMACS is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License +# as published by the Free Software Foundation; either version 2.1 +# of the License, or (at your option) any later version. +# +# GROMACS is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with GROMACS; if not, see +# http://www.gnu.org/licenses, or write to the Free Software Foundation, +# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# +# If you want to redistribute modifications to GROMACS, please +# consider that scientific software is very special. Version +# control is crucial - bugs must be traceable. We will be happy to +# consider code for inclusion in the official distribution, but +# derived work must not be called official GROMACS. Details are found +# in the README & COPYING files - if they are missing, get the +# official version at http://www.gromacs.org. +# +# To help us fund GROMACS development, we humbly ask that you cite +# the research papers on the package. Check out http://www.gromacs.org. + +if (GMX_GPU_SYCL) + file(GLOB NBNXM_SYCL_SOURCES *.cpp) + set(NBNXM_SOURCES ${NBNXM_SOURCES} ${NBNXM_SYCL_SOURCES} PARENT_SCOPE) + _gmx_add_files_to_property(SYCL_SOURCES ${NBNXM_SYCL_SOURCES}) +endif() diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp new file mode 100644 index 0000000000..c4a2d3f725 --- /dev/null +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp @@ -0,0 +1,269 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2020,2021, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/*! \internal \file + * \brief + * Data management and kernel launch functions for nbnxm sycl. + * + * \ingroup module_nbnxm + */ +#include "gmxpre.h" + +#include "gromacs/nbnxm/gpu_common.h" +#include "gromacs/utility/exceptions.h" + +#include "nbnxm_sycl_kernel.h" +#include "nbnxm_sycl_kernel_pruneonly.h" +#include "nbnxm_sycl_types.h" + +namespace Nbnxm +{ + +/*! \brief + * Launch asynchronously the download of nonbonded forces from the GPU + * (and energies/shift forces if required). + */ +void gpu_launch_cpyback(NbnxmGpu* nb, + struct nbnxn_atomdata_t* nbatom, + const gmx::StepWorkload& stepWork, + const AtomLocality atomLocality) +{ + GMX_ASSERT(nb, "Need a valid nbnxn_gpu object"); + + const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality); + const DeviceStream& deviceStream = *nb->deviceStreams[iloc]; + sycl_atomdata_t* adat = nb->atdat; + + /* don't launch non-local copy-back if there was no non-local work to do */ + if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc)) + { + nb->bNonLocalStreamActive = false; + return; + } + + int adatBegin, adatLen; + getGpuAtomRange(adat, atomLocality, &adatBegin, &adatLen); + + // With DD the local D2H transfer can only start after the non-local kernel has finished. + if (iloc == InteractionLocality::Local && nb->bNonLocalStreamActive) + { + nb->nonlocal_done.waitForEvent(); + } + + /* DtoH f + * Skip if buffer ops / reduction is offloaded to the GPU. + */ + if (!stepWork.useGpuFBufferOps) + { + GMX_ASSERT(adat->f.elementSize() == sizeof(float3), + "The size of the force buffer element should be equal to the size of float3."); + copyFromDeviceBuffer(reinterpret_cast(nbatom->out[0].f.data()) + adatBegin, + &adat->f, + adatBegin, + adatLen, + deviceStream, + GpuApiCallBehavior::Async, + nullptr); + } + + /* After the non-local D2H is launched the nonlocal_done event can be + recorded which signals that the local D2H can proceed. This event is not + placed after the non-local kernel because we want the non-local data + back first. */ + if (iloc == InteractionLocality::NonLocal) + { + nb->nonlocal_done.markEvent(deviceStream); + nb->bNonLocalStreamActive = true; + } + + /* only transfer energies in the local stream */ + if (iloc == InteractionLocality::Local) + { + /* DtoH fshift when virial is needed */ + if (stepWork.computeVirial) + { + GMX_ASSERT(sizeof(*nb->nbst.fshift) == adat->fShift.elementSize(), + "Sizes of host- and device-side shift vector elements should be the same."); + copyFromDeviceBuffer( + nb->nbst.fshift, &adat->fShift, 0, SHIFTS, deviceStream, GpuApiCallBehavior::Async, nullptr); + } + + /* DtoH energies */ + if (stepWork.computeEnergy) + { + GMX_ASSERT(sizeof(*nb->nbst.e_lj) == sizeof(float), + "Sizes of host- and device-side LJ energy terms should be the same."); + copyFromDeviceBuffer( + nb->nbst.e_lj, &adat->eLJ, 0, 1, deviceStream, GpuApiCallBehavior::Async, nullptr); + GMX_ASSERT(sizeof(*nb->nbst.e_el) == sizeof(float), + "Sizes of host- and device-side electrostatic energy terms should be the " + "same."); + copyFromDeviceBuffer( + nb->nbst.e_el, &adat->eElec, 0, 1, deviceStream, GpuApiCallBehavior::Async, nullptr); + } + } +} + +/*! \brief Launch asynchronously the xq buffer host to device copy. */ +void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality) +{ + GMX_ASSERT(nb, "Need a valid nbnxn_gpu object"); + validateGpuAtomLocality(atomLocality); + + const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality); + + sycl_atomdata_t* adat = nb->atdat; + gpu_plist* plist = nb->plist[iloc]; + const DeviceStream& deviceStream = *nb->deviceStreams[iloc]; + + /* Don't launch the non-local H2D copy if there is no dependent + work to do: neither non-local nor other (e.g. bonded) work + to do that has as input the nbnxn coordinates. + Doing the same for the local kernel is more complicated, since the + local part of the force array also depends on the non-local kernel. + So to avoid complicating the code and to reduce the risk of bugs, + we always call the local local x+q copy (and the rest of the local + work in nbnxn_gpu_launch_kernel(). + */ + if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc)) + { + plist->haveFreshList = false; + return; + } + + int adatBegin, adatLen; + getGpuAtomRange(adat, atomLocality, &adatBegin, &adatLen); + + /* HtoD x, q */ + GMX_ASSERT(adat->xq.elementSize() == sizeof(float4), + "The size of the xyzq buffer element should be equal to the size of float4."); + copyToDeviceBuffer(&adat->xq, + reinterpret_cast(nbatom->x().data()) + adatBegin, + adatBegin, + adatLen, + deviceStream, + GpuApiCallBehavior::Async, + nullptr); + + /* No need to enforce stream synchronization with events like we do in CUDA/OpenCL. + * Runtime should do the scheduling correctly based on data dependencies. */ +} + +void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, const int numParts) +{ + gpu_plist* plist = nb->plist[iloc]; + + if (plist->haveFreshList) + { + GMX_ASSERT(numParts == 1, "With first pruning we expect 1 part"); + + /* Set rollingPruningNumParts to signal that it is not set */ + plist->rollingPruningNumParts = 0; + plist->rollingPruningPart = 0; + } + else + { + if (plist->rollingPruningNumParts == 0) + { + plist->rollingPruningNumParts = numParts; + } + else + { + GMX_ASSERT(numParts == plist->rollingPruningNumParts, + "It is not allowed to change numParts in between list generation steps"); + } + } + + /* Use a local variable for part and update in plist, so we can return here + * without duplicating the part increment code. + */ + const int part = plist->rollingPruningPart; + + plist->rollingPruningPart++; + if (plist->rollingPruningPart >= plist->rollingPruningNumParts) + { + plist->rollingPruningPart = 0; + } + + /* Compute the number of list entries to prune in this pass */ + const int numSciInPart = (plist->nsci - part) / numParts; + + /* Don't launch the kernel if there is no work to do */ + if (numSciInPart <= 0) + { + plist->haveFreshList = false; + return; + } + + launchNbnxmKernelPruneOnly(nb, iloc, numParts, part, numSciInPart); + + if (plist->haveFreshList) + { + plist->haveFreshList = false; + nb->didPrune[iloc] = true; // Mark that pruning has been done + } + else + { + nb->didRollingPrune[iloc] = true; // Mark that rolling pruning has been done + } +} + +void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nbnxm::InteractionLocality iloc) +{ + const NBParamGpu* nbp = nb->nbparam; + gpu_plist* plist = nb->plist[iloc]; + + if (canSkipNonbondedWork(*nb, iloc)) + { + plist->haveFreshList = false; + return; + } + + if (nbp->useDynamicPruning && plist->haveFreshList) + { + // Prunes for rlistOuter and rlistInner, sets plist->haveFreshList=false + gpu_launch_kernel_pruneonly(nb, iloc, 1); + } + + if (plist->nsci == 0) + { + /* Don't launch an empty local kernel */ + return; + } + + launchNbnxmKernel(nb, stepWork, iloc); +} + +} // namespace Nbnxm diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp new file mode 100644 index 0000000000..e35b974c3a --- /dev/null +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp @@ -0,0 +1,354 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2020,2021, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/*! \internal \file + * \brief + * Stubs of functions that must be defined by nbnxm sycl implementation. + * + * \ingroup module_nbnxm + */ +#include "gmxpre.h" + +#include "gromacs/gpu_utils/device_stream_manager.h" +#include "gromacs/hardware/device_information.h" +#include "gromacs/mdtypes/interaction_const.h" +#include "gromacs/nbnxm/atomdata.h" +#include "gromacs/nbnxm/gpu_data_mgmt.h" +#include "gromacs/nbnxm/nbnxm_gpu.h" +#include "gromacs/nbnxm/nbnxm_gpu_data_mgmt.h" +#include "gromacs/pbcutil/ishift.h" +#include "gromacs/utility/exceptions.h" + +#include "nbnxm_sycl_types.h" + +namespace Nbnxm +{ + +//! This function is documented in the header file +void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial) +{ + sycl_atomdata_t* adat = nb->atdat; + const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local]; + // Clear forces + clearDeviceBufferAsync(&adat->f, 0, nb->atdat->natoms, localStream); + // Clear shift force array and energies if the outputs were used in the current step + if (computeVirial) + { + clearDeviceBufferAsync(&adat->fShift, 0, SHIFTS, localStream); + clearDeviceBufferAsync(&adat->eLJ, 0, 1, localStream); + clearDeviceBufferAsync(&adat->eElec, 0, 1, localStream); + } +} + +/*! \brief Initialize \p atomdata first time; it only gets filled at pair-search. */ +static void initAtomdataFirst(NbnxmGpu* nb, int numTypes, const DeviceContext& deviceContext) +{ + const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local]; + sycl_atomdata_t* atomdata = nb->atdat; + atomdata->numTypes = numTypes; + allocateDeviceBuffer(&atomdata->shiftVec, SHIFTS, deviceContext); + atomdata->shiftVecUploaded = false; + + allocateDeviceBuffer(&atomdata->fShift, SHIFTS, deviceContext); + allocateDeviceBuffer(&atomdata->eLJ, 1, deviceContext); + allocateDeviceBuffer(&atomdata->eElec, 1, deviceContext); + + clearDeviceBufferAsync(&atomdata->fShift, 0, SHIFTS, localStream); + clearDeviceBufferAsync(&atomdata->eElec, 0, 1, localStream); + clearDeviceBufferAsync(&atomdata->eLJ, 0, 1, localStream); + + /* initialize to nullptr pointers to data that is not allocated here and will + need reallocation in later */ + atomdata->xq = nullptr; + atomdata->f = nullptr; + + /* size -1 indicates that the respective array hasn't been initialized yet */ + atomdata->natoms = -1; + atomdata->numAlloc = -1; +} + +/*! \brief Initialize the nonbonded parameter data structure. */ +static void initNbparam(NBParamGpu* nbp, + const interaction_const_t& ic, + const PairlistParams& listParams, + const nbnxn_atomdata_t::Params& nbatParams, + const DeviceContext& deviceContext) +{ + const int numTypes = nbatParams.numTypes; + + set_cutoff_parameters(nbp, &ic, listParams); + + nbp->vdwType = nbnxmGpuPickVdwKernelType(&ic, nbatParams.ljCombinationRule); + nbp->elecType = nbnxmGpuPickElectrostaticsKernelType(&ic, deviceContext.deviceInfo()); + + /* generate table for PME */ + nbp->coulomb_tab = nullptr; + if (nbp->elecType == ElecType::EwaldTab || nbp->elecType == ElecType::EwaldTabTwin) + { + GMX_RELEASE_ASSERT(ic.coulombEwaldTables, "Need valid Coulomb Ewald correction tables"); + init_ewald_coulomb_force_table(*ic.coulombEwaldTables, nbp, deviceContext); + } + + /* set up LJ parameter lookup table */ + if (!useLjCombRule(nbp->vdwType)) + { + initParamLookupTable( + &nbp->nbfp, &nbp->nbfp_texobj, nbatParams.nbfp.data(), 2 * numTypes * numTypes, deviceContext); + } + + /* set up LJ-PME parameter lookup table */ + if (ic.vdwtype == evdwPME) + { + initParamLookupTable( + &nbp->nbfp_comb, &nbp->nbfp_comb_texobj, nbatParams.nbfp_comb.data(), 2 * numTypes, deviceContext); + } +} + +NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager, + const interaction_const_t* ic, + const PairlistParams& listParams, + const nbnxn_atomdata_t* nbat, + const bool bLocalAndNonlocal) +{ + auto* nb = new NbnxmGpu(); + nb->deviceContext_ = &deviceStreamManager.context(); + nb->atdat = new sycl_atomdata_t; + nb->nbparam = new NBParamGpu; + nb->plist[InteractionLocality::Local] = new Nbnxm::gpu_plist; + if (bLocalAndNonlocal) + { + nb->plist[InteractionLocality::NonLocal] = new Nbnxm::gpu_plist; + } + + nb->bUseTwoStreams = bLocalAndNonlocal; + + nb->timers = nullptr; + nb->timings = nullptr; + + /* init nbst */ + pmalloc(reinterpret_cast(&nb->nbst.e_lj), sizeof(*nb->nbst.e_lj)); + pmalloc(reinterpret_cast(&nb->nbst.e_el), sizeof(*nb->nbst.e_el)); + pmalloc(reinterpret_cast(&nb->nbst.fshift), SHIFTS * sizeof(*nb->nbst.fshift)); + + init_plist(nb->plist[InteractionLocality::Local]); + + /* local/non-local GPU streams */ + GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal), + "Local non-bonded stream should be initialized to use GPU for non-bonded."); + nb->deviceStreams[InteractionLocality::Local] = + &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal); + // In general, it's not strictly necessary to use 2 streams for SYCL, since they are + // out-of-order. But for the time being, it will be less disruptive to keep them. + if (nb->bUseTwoStreams) + { + init_plist(nb->plist[InteractionLocality::NonLocal]); + + GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal), + "Non-local non-bonded stream should be initialized to use GPU for " + "non-bonded with domain decomposition."); + nb->deviceStreams[InteractionLocality::NonLocal] = + &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal); + } + + nb->xNonLocalCopyD2HDone = new GpuEventSynchronizer(); + + nb->bDoTime = false; + + const nbnxn_atomdata_t::Params& nbatParams = nbat->params(); + const DeviceContext& deviceContext = *nb->deviceContext_; + + initNbparam(nb->nbparam, *ic, listParams, nbatParams, deviceContext); + initAtomdataFirst(nb, nbatParams.numTypes, deviceContext); + + return nb; +} + +void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom) +{ + sycl_atomdata_t* adat = nb->atdat; + const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local]; + + /* only if we have a dynamic box */ + if (nbatom->bDynamicBox || !adat->shiftVecUploaded) + { + GMX_ASSERT(adat->shiftVec.elementSize() == sizeof(nbatom->shift_vec[0]), + "Sizes of host- and device-side shift vectors should be the same."); + copyToDeviceBuffer(&adat->shiftVec, + reinterpret_cast(nbatom->shift_vec.data()), + 0, + SHIFTS, + localStream, + GpuApiCallBehavior::Async, + nullptr); + adat->shiftVecUploaded = true; + } +} + +void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) +{ + GMX_ASSERT(!nb->bDoTime, "Timing on SYCL not supported yet"); + sycl_atomdata_t* atdat = nb->atdat; + const DeviceContext& deviceContext = *nb->deviceContext_; + const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local]; + + int numAtoms = nbat->numAtoms(); + bool reallocated = false; + if (numAtoms > atdat->numAlloc) + { + int numAlloc = over_alloc_small(numAtoms); + + /* free up first if the arrays have already been initialized */ + if (atdat->numAlloc != -1) + { + freeDeviceBuffer(&atdat->f); + freeDeviceBuffer(&atdat->xq); + freeDeviceBuffer(&atdat->atomTypes); + freeDeviceBuffer(&atdat->ljComb); + } + + allocateDeviceBuffer(&atdat->f, numAlloc, deviceContext); + allocateDeviceBuffer(&atdat->xq, numAlloc, deviceContext); + if (useLjCombRule(nb->nbparam->vdwType)) + { + allocateDeviceBuffer(&atdat->ljComb, numAlloc, deviceContext); + } + else + { + allocateDeviceBuffer(&atdat->atomTypes, numAlloc, deviceContext); + } + + atdat->numAlloc = numAlloc; + reallocated = true; + } + + atdat->natoms = numAtoms; + atdat->natoms_local = nbat->natoms_local; + + /* need to clear GPU f output if realloc happened */ + if (reallocated) + { + clearDeviceBufferAsync(&atdat->f, 0, atdat->numAlloc, localStream); + } + + if (useLjCombRule(nb->nbparam->vdwType)) + { + GMX_ASSERT(atdat->ljComb.elementSize() == sizeof(float2), + "Size of the LJ parameters element should be equal to the size of float2."); + copyToDeviceBuffer(&atdat->ljComb, + reinterpret_cast(nbat->params().lj_comb.data()), + 0, + numAtoms, + localStream, + GpuApiCallBehavior::Async, + nullptr); + } + else + { + GMX_ASSERT(atdat->atomTypes.elementSize() == sizeof(nbat->params().type[0]), + "Sizes of host- and device-side atom types should be the same."); + copyToDeviceBuffer(&atdat->atomTypes, + nbat->params().type.data(), + 0, + numAtoms, + localStream, + GpuApiCallBehavior::Async, + nullptr); + } +} + +void gpu_free(NbnxmGpu* nb) +{ + if (nb == nullptr) + { + return; + } + + sycl_atomdata_t* atdat = nb->atdat; + NBParamGpu* nbparam = nb->nbparam; + + if ((!nbparam->coulomb_tab) + && (nbparam->elecType == ElecType::EwaldTab || nbparam->elecType == ElecType::EwaldTabTwin)) + { + destroyParamLookupTable(&nbparam->coulomb_tab, nbparam->coulomb_tab_texobj); + } + + if (!useLjCombRule(nb->nbparam->vdwType)) + { + destroyParamLookupTable(&nbparam->nbfp, nbparam->nbfp_texobj); + } + + if (nbparam->vdwType == VdwType::EwaldGeom || nbparam->vdwType == VdwType::EwaldLB) + { + destroyParamLookupTable(&nbparam->nbfp_comb, nbparam->nbfp_comb_texobj); + } + + /* Free plist */ + auto* plist = nb->plist[InteractionLocality::Local]; + delete plist; + if (nb->bUseTwoStreams) + { + auto* plist_nl = nb->plist[InteractionLocality::NonLocal]; + delete plist_nl; + } + + /* Free nbst */ + pfree(nb->nbst.e_lj); + nb->nbst.e_lj = nullptr; + + pfree(nb->nbst.e_el); + nb->nbst.e_el = nullptr; + + pfree(nb->nbst.fshift); + nb->nbst.fshift = nullptr; + + delete atdat; + delete nbparam; + delete nb; +} + +int gpu_min_ci_balanced(NbnxmGpu* nb) +{ + // SYCL-TODO: Logic and magic values taken from OpenCL + static constexpr unsigned int balancedFactor = 50; + if (nb == nullptr) + { + return 0; + } + const cl::sycl::device device = nb->deviceContext_->deviceInfo().syclDevice; + const int numComputeUnits = device.get_info(); + return balancedFactor * numComputeUnits; +} + +} // namespace Nbnxm diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp new file mode 100644 index 0000000000..a68c9d8b2f --- /dev/null +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp @@ -0,0 +1,1016 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2020,2021, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/*! \internal \file + * \brief + * NBNXM SYCL kernels + * + * \ingroup module_nbnxm + */ +#include "gmxpre.h" + +#include "nbnxm_sycl_kernel.h" + +#include "gromacs/gpu_utils/devicebuffer.h" +#include "gromacs/gpu_utils/gmxsycl.h" +#include "gromacs/math/functions.h" +#include "gromacs/mdtypes/simulation_workload.h" +#include "gromacs/pbcutil/ishift.h" +#include "gromacs/utility/template_mp.h" + +#include "nbnxm_sycl_kernel_utils.h" +#include "nbnxm_sycl_types.h" + +namespace Nbnxm +{ + +//! \brief Set of boolean constants mimicking preprocessor macros. +template +struct EnergyFunctionProperties { + static constexpr bool elecCutoff = (elecType == ElecType::Cut); ///< EL_CUTOFF + static constexpr bool elecRF = (elecType == ElecType::RF); ///< EL_RF + static constexpr bool elecEwaldAna = + (elecType == ElecType::EwaldAna || elecType == ElecType::EwaldAnaTwin); ///< EL_EWALD_ANA + static constexpr bool elecEwaldTab = + (elecType == ElecType::EwaldTab || elecType == ElecType::EwaldTabTwin); ///< EL_EWALD_TAB + static constexpr bool elecEwaldTwin = + (elecType == ElecType::EwaldAnaTwin || elecType == ElecType::EwaldTabTwin); + static constexpr bool elecEwald = (elecEwaldAna || elecEwaldTab); ///< EL_EWALD_ANY + static constexpr bool vdwCombLB = (vdwType == VdwType::CutCombLB); + static constexpr bool vdwCombGeom = (vdwType == VdwType::CutCombGeom); ///< LJ_COMB_GEOM + static constexpr bool vdwComb = (vdwCombLB || vdwCombGeom); ///< LJ_COMB + static constexpr bool vdwEwaldCombGeom = (vdwType == VdwType::EwaldGeom); ///< LJ_EWALD_COMB_GEOM + static constexpr bool vdwEwaldCombLB = (vdwType == VdwType::EwaldLB); ///< LJ_EWALD_COMB_LB + static constexpr bool vdwEwald = (vdwEwaldCombGeom || vdwEwaldCombLB); ///< LJ_EWALD + static constexpr bool vdwFSwitch = (vdwType == VdwType::FSwitch); ///< LJ_FORCE_SWITCH + static constexpr bool vdwPSwitch = (vdwType == VdwType::PSwitch); ///< LJ_POT_SWITCH +}; + +//! \brief Templated constants to shorten kernel function declaration. +//@{ +template +constexpr bool ljComb = EnergyFunctionProperties().vdwComb; + +template // Yes, ElecType +constexpr bool vdwCutoffCheck = EnergyFunctionProperties().elecEwaldTwin; + +template +constexpr bool elecEwald = EnergyFunctionProperties().elecEwald; + +template +constexpr bool elecEwaldTab = EnergyFunctionProperties().elecEwaldTab; + +template +constexpr bool ljEwald = EnergyFunctionProperties().vdwEwald; +//@} + +using cl::sycl::access::fence_space; +using cl::sycl::access::mode; +using cl::sycl::access::target; + +static inline void convertSigmaEpsilonToC6C12(const float sigma, + const float epsilon, + cl::sycl::private_ptr c6, + cl::sycl::private_ptr c12) +{ + const float sigma2 = sigma * sigma; + const float sigma6 = sigma2 * sigma2 * sigma2; + *c6 = epsilon * sigma6; + *c12 = (*c6) * sigma6; +} + +template +static inline void ljForceSwitch(const shift_consts_t dispersionShift, + const shift_consts_t repulsionShift, + const float rVdwSwitch, + const float c6, + const float c12, + const float rInv, + const float r2, + cl::sycl::private_ptr fInvR, + cl::sycl::private_ptr eLJ) +{ + /* force switch constants */ + const float dispShiftV2 = dispersionShift.c2; + const float dispShiftV3 = dispersionShift.c3; + const float repuShiftV2 = repulsionShift.c2; + const float repuShiftV3 = repulsionShift.c3; + + const float r = r2 * rInv; + const float rSwitch = cl::sycl::fdim(r, rVdwSwitch); // max(r - rVdwSwitch, 0) + + *fInvR += -c6 * (dispShiftV2 + dispShiftV3 * rSwitch) * rSwitch * rSwitch * rInv + + c12 * (repuShiftV2 + repuShiftV3 * rSwitch) * rSwitch * rSwitch * rInv; + + if constexpr (doCalcEnergies) + { + const float dispShiftF2 = dispShiftV2 / 3; + const float dispShiftF3 = dispShiftV3 / 4; + const float repuShiftF2 = repuShiftV2 / 3; + const float repuShiftF3 = repuShiftV3 / 4; + *eLJ += c6 * (dispShiftF2 + dispShiftF3 * rSwitch) * rSwitch * rSwitch * rSwitch + - c12 * (repuShiftF2 + repuShiftF3 * rSwitch) * rSwitch * rSwitch * rSwitch; + } +} + +//! \brief Fetch C6 grid contribution coefficients and return the product of these. +template +static inline float calculateLJEwaldC6Grid(const DeviceAccessor a_nbfpComb, + const int typeI, + const int typeJ) +{ + if constexpr (vdwType == VdwType::EwaldGeom) + { + return a_nbfpComb[2 * typeI] * a_nbfpComb[2 * typeJ]; + } + else + { + static_assert(vdwType == VdwType::EwaldLB); + /* sigma and epsilon are scaled to give 6*C6 */ + const float c6_i = a_nbfpComb[2 * typeI]; + const float c12_i = a_nbfpComb[2 * typeI + 1]; + const float c6_j = a_nbfpComb[2 * typeJ]; + const float c12_j = a_nbfpComb[2 * typeJ + 1]; + + const float sigma = c6_i + c6_j; + const float epsilon = c12_i * c12_j; + + const float sigma2 = sigma * sigma; + return epsilon * sigma2 * sigma2 * sigma2; + } +} + +//! Calculate LJ-PME grid force contribution with geometric or LB combination rule. +template +static inline void ljEwaldComb(const DeviceAccessor a_nbfpComb, + const float sh_lj_ewald, + const int typeI, + const int typeJ, + const float r2, + const float r2Inv, + const float lje_coeff2, + const float lje_coeff6_6, + const float int_bit, + cl::sycl::private_ptr fInvR, + cl::sycl::private_ptr eLJ) +{ + const float c6grid = calculateLJEwaldC6Grid(a_nbfpComb, typeI, typeJ); + + /* Recalculate inv_r6 without exclusion mask */ + const float inv_r6_nm = r2Inv * r2Inv * r2Inv; + const float cr2 = lje_coeff2 * r2; + const float expmcr2 = cl::sycl::exp(-cr2); + const float poly = 1.0F + cr2 + 0.5F * cr2 * cr2; + + /* Subtract the grid force from the total LJ force */ + *fInvR += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * r2Inv; + + if constexpr (doCalcEnergies) + { + /* Shift should be applied only to real LJ pairs */ + const float sh_mask = sh_lj_ewald * int_bit; + *eLJ += c_oneSixth * c6grid * (inv_r6_nm * (1.0F - expmcr2 * poly) + sh_mask); + } +} + +/*! \brief Apply potential switch. */ +template +static inline void ljPotentialSwitch(const switch_consts_t vdwSwitch, + const float rVdwSwitch, + const float rInv, + const float r2, + cl::sycl::private_ptr fInvR, + cl::sycl::private_ptr eLJ) +{ + /* potential switch constants */ + const float switchV3 = vdwSwitch.c3; + const float switchV4 = vdwSwitch.c4; + const float switchV5 = vdwSwitch.c5; + const float switchF2 = 3 * vdwSwitch.c3; + const float switchF3 = 4 * vdwSwitch.c4; + const float switchF4 = 5 * vdwSwitch.c5; + + const float r = r2 * rInv; + const float rSwitch = r - rVdwSwitch; + + if (rSwitch > 0.0F) + { + const float sw = + 1.0F + (switchV3 + (switchV4 + switchV5 * rSwitch) * rSwitch) * rSwitch * rSwitch * rSwitch; + const float dsw = (switchF2 + (switchF3 + switchF4 * rSwitch) * rSwitch) * rSwitch * rSwitch; + + *fInvR = (*fInvR) * sw - rInv * (*eLJ) * dsw; + if constexpr (doCalcEnergies) + { + *eLJ *= sw; + } + } +} + + +/*! \brief Calculate analytical Ewald correction term. */ +static inline float pmeCorrF(const float z2) +{ + constexpr float FN6 = -1.7357322914161492954e-8F; + constexpr float FN5 = 1.4703624142580877519e-6F; + constexpr float FN4 = -0.000053401640219807709149F; + constexpr float FN3 = 0.0010054721316683106153F; + constexpr float FN2 = -0.019278317264888380590F; + constexpr float FN1 = 0.069670166153766424023F; + constexpr float FN0 = -0.75225204789749321333F; + + constexpr float FD4 = 0.0011193462567257629232F; + constexpr float FD3 = 0.014866955030185295499F; + constexpr float FD2 = 0.11583842382862377919F; + constexpr float FD1 = 0.50736591960530292870F; + constexpr float FD0 = 1.0F; + + const float z4 = z2 * z2; + + float polyFD0 = FD4 * z4 + FD2; + const float polyFD1 = FD3 * z4 + FD1; + polyFD0 = polyFD0 * z4 + FD0; + polyFD0 = polyFD1 * z2 + polyFD0; + + polyFD0 = 1.0F / polyFD0; + + float polyFN0 = FN6 * z4 + FN4; + float polyFN1 = FN5 * z4 + FN3; + polyFN0 = polyFN0 * z4 + FN2; + polyFN1 = polyFN1 * z4 + FN1; + polyFN0 = polyFN0 * z4 + FN0; + polyFN0 = polyFN1 * z2 + polyFN0; + + return polyFN0 * polyFD0; +} + +/*! \brief Linear interpolation using exactly two FMA operations. + * + * Implements numeric equivalent of: (1-t)*d0 + t*d1. + */ +template +static inline T lerp(T d0, T d1, T t) +{ + return fma(t, d1, fma(-t, d0, d0)); +} + +/*! \brief Interpolate Ewald coulomb force correction using the F*r table. */ +static inline float interpolateCoulombForceR(const DeviceAccessor a_coulombTab, + const float coulombTabScale, + const float r) +{ + const float normalized = coulombTabScale * r; + const int index = static_cast(normalized); + const float fraction = normalized - index; + + const float left = a_coulombTab[index]; + const float right = a_coulombTab[index + 1]; + + return lerp(left, right, fraction); // TODO: cl::sycl::mix +} + +static inline void reduceForceJShuffle(float3 f, + const cl::sycl::nd_item<1> itemIdx, + const int tidxi, + const int aidx, + DeviceAccessor a_f) +{ + static_assert(c_clSize == 8 || c_clSize == 4); + sycl_2020::sub_group sg = itemIdx.get_sub_group(); + + f[0] += shuffleDown(f[0], 1, sg); + f[1] += shuffleUp(f[1], 1, sg); + f[2] += shuffleDown(f[2], 1, sg); + if (tidxi & 1) + { + f[0] = f[1]; + } + + f[0] += shuffleDown(f[0], 2, sg); + f[2] += shuffleUp(f[2], 2, sg); + if (tidxi & 2) + { + f[0] = f[2]; + } + + if constexpr (c_clSize == 8) + { + f[0] += shuffleDown(f[0], 4, sg); + } + + if (tidxi < 3) + { + atomicFetchAdd(a_f, 3 * aidx + tidxi, f[0]); + } +} + + +/*! \brief Final i-force reduction. + * + * This implementation works only with power of two array sizes. + */ +static inline void reduceForceIAndFShift(cl::sycl::accessor sm_buf, + const float3 fCiBuf[c_nbnxnGpuNumClusterPerSupercluster], + const bool calcFShift, + const cl::sycl::nd_item<1> itemIdx, + const int tidxi, + const int tidxj, + const int sci, + const int shift, + DeviceAccessor a_f, + DeviceAccessor a_fShift) +{ + static constexpr int bufStride = c_clSize * c_clSize; + static constexpr int clSizeLog2 = gmx::StaticLog2::value; + const int tidx = tidxi + tidxj * c_clSize; + float fShiftBuf = 0; + for (int ciOffset = 0; ciOffset < c_nbnxnGpuNumClusterPerSupercluster; ciOffset++) + { + const int aidx = (sci * c_nbnxnGpuNumClusterPerSupercluster + ciOffset) * c_clSize + tidxi; + /* store i forces in shmem */ + sm_buf[tidx] = fCiBuf[ciOffset][0]; + sm_buf[bufStride + tidx] = fCiBuf[ciOffset][1]; + sm_buf[2 * bufStride + tidx] = fCiBuf[ciOffset][2]; + itemIdx.barrier(fence_space::local_space); + + /* Reduce the initial c_clSize values for each i atom to half + * every step by using c_clSize * i threads. */ + int i = c_clSize / 2; + for (int j = clSizeLog2 - 1; j > 0; j--) + { + if (tidxj < i) + { + sm_buf[tidxj * c_clSize + tidxi] += sm_buf[(tidxj + i) * c_clSize + tidxi]; + sm_buf[bufStride + tidxj * c_clSize + tidxi] += + sm_buf[bufStride + (tidxj + i) * c_clSize + tidxi]; + sm_buf[2 * bufStride + tidxj * c_clSize + tidxi] += + sm_buf[2 * bufStride + (tidxj + i) * c_clSize + tidxi]; + } + i >>= 1; + itemIdx.barrier(fence_space::local_space); + } + + /* i == 1, last reduction step, writing to global mem */ + /* Split the reduction between the first 3 line threads + Threads with line id 0 will do the reduction for (float3).x components + Threads with line id 1 will do the reduction for (float3).y components + Threads with line id 2 will do the reduction for (float3).z components. */ + if (tidxj < 3) + { + const float f = + sm_buf[tidxj * bufStride + tidxi] + sm_buf[tidxj * bufStride + c_clSize + tidxi]; + atomicFetchAdd(a_f, 3 * aidx + tidxj, f); + if (calcFShift) + { + fShiftBuf += f; + } + } + itemIdx.barrier(fence_space::local_space); + } + /* add up local shift forces into global mem */ + if (calcFShift) + { + /* Only threads with tidxj < 3 will update fshift. + The threads performing the update must be the same as the threads + storing the reduction result above. */ + if (tidxj < 3) + { + atomicFetchAdd(a_fShift, 3 * shift + tidxj, fShiftBuf); + } + } +} + + +/*! \brief Main kernel for NBNXM. + * + */ +template +auto nbnxmKernel(cl::sycl::handler& cgh, + DeviceAccessor a_xq, + DeviceAccessor a_f, + DeviceAccessor a_shiftVec, + DeviceAccessor a_fShift, + OptionalAccessor a_energyElec, + OptionalAccessor a_energyVdw, + DeviceAccessor a_plistCJ4, + DeviceAccessor a_plistSci, + DeviceAccessor a_plistExcl, + OptionalAccessor> a_ljComb, + OptionalAccessor> a_atomTypes, + OptionalAccessor> a_nbfp, + OptionalAccessor> a_nbfpComb, + OptionalAccessor> a_coulombTab, + const int numTypes, + const float rCoulombSq, + const float rVdwSq, + const float twoKRf, + const float ewaldBeta, + const float rlistOuterSq, + const float ewaldShift, + const float epsFac, + const float ewaldCoeffLJ, + const float cRF, + const shift_consts_t dispersionShift, + const shift_consts_t repulsionShift, + const switch_consts_t vdwSwitch, + const float rVdwSwitch, + const float ljEwaldShift, + const float coulombTabScale, + const bool calcShift) +{ + static constexpr EnergyFunctionProperties props; + + cgh.require(a_xq); + cgh.require(a_f); + cgh.require(a_shiftVec); + cgh.require(a_fShift); + if constexpr (doCalcEnergies) + { + cgh.require(a_energyElec); + cgh.require(a_energyVdw); + } + cgh.require(a_plistCJ4); + cgh.require(a_plistSci); + cgh.require(a_plistExcl); + if constexpr (!props.vdwComb) + { + cgh.require(a_atomTypes); + cgh.require(a_nbfp); + } + else + { + cgh.require(a_ljComb); + } + if constexpr (props.vdwEwald) + { + cgh.require(a_nbfpComb); + } + if constexpr (props.elecEwaldTab) + { + cgh.require(a_coulombTab); + } + + // shmem buffer for i x+q pre-loading + cl::sycl::accessor sm_xq( + cl::sycl::range<2>(c_nbnxnGpuNumClusterPerSupercluster, c_clSize), cgh); + + // shmem buffer for force reduction + // SYCL-TODO: Make into 3D; section 4.7.6.11 of SYCL2020 specs + cl::sycl::accessor sm_reductionBuffer( + cl::sycl::range<1>(c_clSize * c_clSize * DIM), cgh); + + auto sm_atomTypeI = [&]() { + if constexpr (!props.vdwComb) + { + return cl::sycl::accessor( + cl::sycl::range<2>(c_nbnxnGpuNumClusterPerSupercluster, c_clSize), cgh); + } + else + { + return nullptr; + } + }(); + + auto sm_ljCombI = [&]() { + if constexpr (props.vdwComb) + { + return cl::sycl::accessor( + cl::sycl::range<2>(c_nbnxnGpuNumClusterPerSupercluster, c_clSize), cgh); + } + else + { + return nullptr; + } + }(); + + /* Flag to control the calculation of exclusion forces in the kernel + * We do that with Ewald (elec/vdw) and RF. Cut-off only has exclusion + * energy terms. */ + constexpr bool doExclusionForces = + (props.elecEwald || props.elecRF || props.vdwEwald || (props.elecCutoff && doCalcEnergies)); + + constexpr int subGroupSize = c_clSize * c_clSize / 2; + + return [=](cl::sycl::nd_item<1> itemIdx) [[intel::reqd_sub_group_size(subGroupSize)]] + { + /* thread/block/warp id-s */ + const cl::sycl::id<3> localId = unflattenId(itemIdx.get_local_id()); + const unsigned tidxi = localId[0]; + const unsigned tidxj = localId[1]; + const unsigned tidx = tidxj * c_clSize + tidxi; + const unsigned tidxz = 0; + + // Group indexing was flat originally, no need to unflatten it. + const unsigned bidx = itemIdx.get_group(0); + + const sycl_2020::sub_group sg = itemIdx.get_sub_group(); + // Better use sg.get_group_range, but too much of the logic relies on it anyway + const unsigned widx = tidx / subGroupSize; + + float3 fCiBuf[c_nbnxnGpuNumClusterPerSupercluster]; // i force buffer + for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++) + { + fCiBuf[i] = float3(0.0F, 0.0F, 0.0F); + } + + const nbnxn_sci_t nbSci = a_plistSci[bidx]; + const int sci = nbSci.sci; + const int cij4Start = nbSci.cj4_ind_start; + const int cij4End = nbSci.cj4_ind_end; + + // Only needed if props.elecEwaldAna + const float beta2 = ewaldBeta * ewaldBeta; + const float beta3 = ewaldBeta * ewaldBeta * ewaldBeta; + + for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i += c_clSize) + { + /* Pre-load i-atom x and q into shared memory */ + const int ci = sci * c_nbnxnGpuNumClusterPerSupercluster + tidxj + i; + const int ai = ci * c_clSize + tidxi; + const cl::sycl::id<2> cacheIdx = cl::sycl::id<2>(tidxj + i, tidxi); + + const float3 shift = a_shiftVec[nbSci.shift]; + float4 xqi = a_xq[ai]; + xqi += float4(shift[0], shift[1], shift[2], 0.0F); + xqi[3] *= epsFac; + sm_xq[cacheIdx] = xqi; + + if constexpr (!props.vdwComb) + { + // Pre-load the i-atom types into shared memory + sm_atomTypeI[cacheIdx] = a_atomTypes[ai]; + } + else + { + // Pre-load the LJ combination parameters into shared memory + sm_ljCombI[cacheIdx] = a_ljComb[ai]; + } + } + itemIdx.barrier(fence_space::local_space); + + float ewaldCoeffLJ_2, ewaldCoeffLJ_6_6; // Only needed if (props.vdwEwald) + if constexpr (props.vdwEwald) + { + ewaldCoeffLJ_2 = ewaldCoeffLJ * ewaldCoeffLJ; + ewaldCoeffLJ_6_6 = ewaldCoeffLJ_2 * ewaldCoeffLJ_2 * ewaldCoeffLJ_2 * c_oneSixth; + } + + float energyVdw, energyElec; // Only needed if (doCalcEnergies) + if constexpr (doCalcEnergies) + { + energyVdw = energyElec = 0.0F; + } + if constexpr (doCalcEnergies && doExclusionForces) + { + if (nbSci.shift == CENTRAL && a_plistCJ4[cij4Start].cj[0] == sci * c_nbnxnGpuNumClusterPerSupercluster) + { + // we have the diagonal: add the charge and LJ self interaction energy term + for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++) + { + // TODO: Are there other options? + if constexpr (props.elecEwald || props.elecRF || props.elecCutoff) + { + const float qi = sm_xq[i][tidxi][3]; + energyElec += qi * qi; + } + if constexpr (props.vdwEwald) + { + energyVdw += + a_nbfp[a_atomTypes[(sci * c_nbnxnGpuNumClusterPerSupercluster + i) * c_clSize + tidxi] + * (numTypes + 1) * 2]; + } + } + /* divide the self term(s) equally over the j-threads, then multiply with the coefficients. */ + if constexpr (props.vdwEwald) + { + energyVdw /= c_clSize; + energyVdw *= 0.5F * c_oneSixth * ewaldCoeffLJ_6_6; // c_OneTwelfth? + } + if constexpr (props.elecRF || props.elecCutoff) + { + // Correct for epsfac^2 due to adding qi^2 */ + energyElec /= epsFac * c_clSize; + energyElec *= -0.5F * cRF; + } + if constexpr (props.elecEwald) + { + // Correct for epsfac^2 due to adding qi^2 */ + energyElec /= epsFac * c_clSize; + energyElec *= -ewaldBeta * c_OneOverSqrtPi; /* last factor 1/sqrt(pi) */ + } + } // (nbSci.shift == CENTRAL && a_plistCJ4[cij4Start].cj[0] == sci * c_nbnxnGpuNumClusterPerSupercluster) + } // (doCalcEnergies && doExclusionForces) + + // Only needed if (doExclusionForces) + const bool nonSelfInteraction = !(nbSci.shift == CENTRAL & tidxj <= tidxi); + + // loop over the j clusters = seen by any of the atoms in the current super-cluster + for (int j4 = cij4Start + tidxz; j4 < cij4End; j4 += 1) + { + unsigned imask = a_plistCJ4[j4].imei[widx].imask; + if (!doPruneNBL && !imask) + { + continue; + } + const int wexclIdx = a_plistCJ4[j4].imei[widx].excl_ind; + const unsigned wexcl = a_plistExcl[wexclIdx].pair[tidx & (subGroupSize - 1)]; // sg.get_local_linear_id() + for (int jm = 0; jm < c_nbnxnGpuJgroupSize; jm++) + { + const bool maskSet = + imask & (superClInteractionMask << (jm * c_nbnxnGpuNumClusterPerSupercluster)); + if (!maskSet) + { + continue; + } + unsigned maskJI = (1U << (jm * c_nbnxnGpuNumClusterPerSupercluster)); + const int cj = a_plistCJ4[j4].cj[jm]; + const int aj = cj * c_clSize + tidxj; + + // load j atom data + const float4 xqj = a_xq[aj]; + + const float3 xj(xqj[0], xqj[1], xqj[2]); + const float qj = xqj[3]; + int atomTypeJ; // Only needed if (!props.vdwComb) + float2 ljCombJ; // Only needed if (props.vdwComb) + if constexpr (props.vdwComb) + { + ljCombJ = a_ljComb[aj]; + } + else + { + atomTypeJ = a_atomTypes[aj]; + } + + float3 fCjBuf(0.0F, 0.0F, 0.0F); + + for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++) + { + if (imask & maskJI) + { + // i cluster index + const int ci = sci * c_nbnxnGpuNumClusterPerSupercluster + i; + // all threads load an atom from i cluster ci into shmem! + const float4 xqi = sm_xq[i][tidxi]; + const float3 xi(xqi[0], xqi[1], xqi[2]); + + // distance between i and j atoms + const float3 rv = xi - xj; + float r2 = norm2(rv); + + if constexpr (doPruneNBL) + { + /* If _none_ of the atoms pairs are in cutoff range, + * the bit corresponding to the current + * cluster-pair in imask gets set to 0. */ + if (!sycl_2020::group_any_of(sg, r2 < rlistOuterSq)) + { + imask &= ~maskJI; + } + } + const float pairExclMask = (wexcl & maskJI) ? 1.0F : 0.0F; + + // cutoff & exclusion check + + const bool notExcluded = doExclusionForces ? (nonSelfInteraction | (ci != cj)) + : (wexcl & maskJI); + + // SYCL-TODO: Check optimal way of branching here. + if ((r2 < rCoulombSq) && notExcluded) + { + const float qi = xqi[3]; + int atomTypeI; // Only needed if (!props.vdwComb) + float c6, c12, sigma, epsilon; + + if constexpr (!props.vdwComb) + { + /* LJ 6*C6 and 12*C12 */ + atomTypeI = sm_atomTypeI[i][tidxi]; + const int idx = (numTypes * atomTypeI + atomTypeJ) * 2; + c6 = a_nbfp[idx]; // TODO: Make a_nbfm into float2 + c12 = a_nbfp[idx + 1]; + } + else + { + const float2 ljCombI = sm_ljCombI[i][tidxi]; + if constexpr (props.vdwCombGeom) + { + c6 = ljCombI[0] * ljCombJ[0]; + c12 = ljCombI[1] * ljCombJ[1]; + } + else + { + static_assert(props.vdwCombLB); + // LJ 2^(1/6)*sigma and 12*epsilon + sigma = ljCombI[0] + ljCombJ[0]; + epsilon = ljCombI[1] * ljCombJ[1]; + if constexpr (doCalcEnergies) + { + convertSigmaEpsilonToC6C12(sigma, epsilon, &c6, &c12); + } + } // props.vdwCombGeom + } // !props.vdwComb + + // Ensure distance do not become so small that r^-12 overflows + r2 = std::max(r2, c_nbnxnMinDistanceSquared); + // SYCL-TODO: sycl::half_precision::rsqrt? + const float rInv = cl::sycl::native::rsqrt(r2); + const float r2Inv = rInv * rInv; + float r6Inv, fInvR, energyLJPair; + if constexpr (!props.vdwCombLB || doCalcEnergies) + { + r6Inv = r2Inv * r2Inv * r2Inv; + if constexpr (doExclusionForces) + { + // SYCL-TODO: Check if true for SYCL + /* We could mask r2Inv, but with Ewald masking both + * r6Inv and fInvR is faster */ + r6Inv *= pairExclMask; + } + fInvR = r6Inv * (c12 * r6Inv - c6) * r2Inv; + } + else + { + float sig_r = sigma * rInv; + float sig_r2 = sig_r * sig_r; + float sig_r6 = sig_r2 * sig_r2 * sig_r2; + if constexpr (doExclusionForces) + { + sig_r6 *= pairExclMask; + } + fInvR = epsilon * sig_r6 * (sig_r6 - 1.0F) * r2Inv; + } // (!props.vdwCombLB || doCalcEnergies) + if constexpr (doCalcEnergies || props.vdwPSwitch) + { + energyLJPair = pairExclMask + * (c12 * (r6Inv * r6Inv + repulsionShift.cpot) * c_oneTwelfth + - c6 * (r6Inv + dispersionShift.cpot) * c_oneSixth); + } + if constexpr (props.vdwFSwitch) + { + ljForceSwitch( + dispersionShift, repulsionShift, rVdwSwitch, c6, c12, rInv, r2, &fInvR, &energyLJPair); + } + if constexpr (props.vdwEwald) + { + ljEwaldComb(a_nbfpComb, + ljEwaldShift, + atomTypeI, + atomTypeJ, + r2, + r2Inv, + ewaldCoeffLJ_2, + ewaldCoeffLJ_6_6, + pairExclMask, + &fInvR, + &energyLJPair); + } // (props.vdwEwald) + if constexpr (props.vdwPSwitch) + { + ljPotentialSwitch( + vdwSwitch, rVdwSwitch, rInv, r2, &fInvR, &energyLJPair); + } + if constexpr (props.elecEwaldTwin) + { + // Separate VDW cut-off check to enable twin-range cut-offs + // (rVdw < rCoulomb <= rList) + const float vdwInRange = (r2 < rVdwSq) ? 1.0F : 0.0F; + fInvR *= vdwInRange; + if constexpr (doCalcEnergies) + { + energyLJPair *= vdwInRange; + } + } + if constexpr (doCalcEnergies) + { + energyVdw += energyLJPair; + } + + if constexpr (props.elecCutoff) + { + if constexpr (doExclusionForces) + { + fInvR += qi * qj * pairExclMask * r2Inv * rInv; + } + else + { + fInvR += qi * qj * r2Inv * rInv; + } + } + if constexpr (props.elecRF) + { + fInvR += qi * qj * (pairExclMask * r2Inv * rInv - twoKRf); + } + if constexpr (props.elecEwaldAna) + { + fInvR += qi * qj + * (pairExclMask * r2Inv * rInv + pmeCorrF(beta2 * r2) * beta3); + } + if constexpr (props.elecEwaldTab) + { + fInvR += qi * qj + * (pairExclMask * r2Inv + - interpolateCoulombForceR( + a_coulombTab, coulombTabScale, r2 * rInv)) + * rInv; + } + + if constexpr (doCalcEnergies) + { + if constexpr (props.elecCutoff) + { + energyElec += qi * qj * (pairExclMask * rInv - cRF); + } + if constexpr (props.elecRF) + { + energyElec += + qi * qj * (pairExclMask * rInv + 0.5f * twoKRf * r2 - cRF); + } + if constexpr (props.elecEwald) + { + energyElec += + qi * qj + * (rInv * (pairExclMask - cl::sycl::erf(r2 * rInv * ewaldBeta)) + - pairExclMask * ewaldShift); + } + } + + const float3 forceIJ = rv * fInvR; + + /* accumulate j forces in registers */ + fCjBuf -= forceIJ; + /* accumulate i forces in registers */ + fCiBuf[i] += forceIJ; + } // (r2 < rCoulombSq) && notExcluded + } // (imask & maskJI) + /* shift the mask bit by 1 */ + maskJI += maskJI; + } // for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++) + /* reduce j forces */ + reduceForceJShuffle(fCjBuf, itemIdx, tidxi, aj, a_f); + } // for (int jm = 0; jm < c_nbnxnGpuJgroupSize; jm++) + if constexpr (doPruneNBL) + { + /* Update the imask with the new one which does not contain the + * out of range clusters anymore. */ + a_plistCJ4[j4].imei[widx].imask = imask; + } + } // for (int j4 = cij4Start; j4 < cij4End; j4 += 1) + + /* skip central shifts when summing shift forces */ + const bool doCalcShift = (calcShift && !(nbSci.shift == CENTRAL)); + + reduceForceIAndFShift( + sm_reductionBuffer, fCiBuf, doCalcShift, itemIdx, tidxi, tidxj, sci, nbSci.shift, a_f, a_fShift); + + if constexpr (doCalcEnergies) + { + const float energyVdwGroup = sycl_2020::group_reduce( + itemIdx.get_group(), energyVdw, 0.0F, sycl_2020::plus()); + const float energyElecGroup = sycl_2020::group_reduce( + itemIdx.get_group(), energyElec, 0.0F, sycl_2020::plus()); + + if (tidx == 0) + { + atomicFetchAdd(a_energyVdw, 0, energyVdwGroup); + atomicFetchAdd(a_energyElec, 0, energyElecGroup); + } + } + }; +} + +// SYCL 1.2.1 requires providing a unique type for a kernel. Should not be needed for SYCL2020. +template +class NbnxmKernelName; + +template +cl::sycl::event launchNbnxmKernel(const DeviceStream& deviceStream, const int numSci, Args&&... args) +{ + // Should not be needed for SYCL2020. + using kernelNameType = NbnxmKernelName; + + /* Kernel launch config: + * - The thread block dimensions match the size of i-clusters, j-clusters, + * and j-cluster concurrency, in x, y, and z, respectively. + * - The 1D block-grid contains as many blocks as super-clusters. + */ + const int numBlocks = numSci; + const cl::sycl::range<3> blockSize{ c_clSize, c_clSize, 1 }; + const cl::sycl::range<3> globalSize{ numBlocks * blockSize[0], blockSize[1], blockSize[2] }; + const cl::sycl::nd_range<3> range{ globalSize, blockSize }; + + cl::sycl::queue q = deviceStream.stream(); + + cl::sycl::event e = q.submit([&](cl::sycl::handler& cgh) { + auto kernel = nbnxmKernel( + cgh, std::forward(args)...); + cgh.parallel_for(flattenNDRange(range), kernel); + }); + + return e; +} + +template +cl::sycl::event chooseAndLaunchNbnxmKernel(bool doPruneNBL, + bool doCalcEnergies, + enum ElecType elecType, + enum VdwType vdwType, + Args&&... args) +{ + return gmx::dispatchTemplatedFunction( + [&](auto doPruneNBL_, auto doCalcEnergies_, auto elecType_, auto vdwType_) { + return launchNbnxmKernel( + std::forward(args)...); + }, + doPruneNBL, + doCalcEnergies, + elecType, + vdwType); +} + +void launchNbnxmKernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const InteractionLocality iloc) +{ + sycl_atomdata_t* adat = nb->atdat; + NBParamGpu* nbp = nb->nbparam; + gpu_plist* plist = nb->plist[iloc]; + const bool doPruneNBL = (plist->haveFreshList && !nb->didPrune[iloc]); + const DeviceStream& deviceStream = *nb->deviceStreams[iloc]; + + // Casting to float simplifies using atomic ops in the kernel + cl::sycl::buffer f(*adat->f.buffer_); + auto fAsFloat = f.reinterpret(f.get_count() * DIM); + cl::sycl::buffer fShift(*adat->fShift.buffer_); + auto fShiftAsFloat = fShift.reinterpret(fShift.get_count() * DIM); + + cl::sycl::event e = chooseAndLaunchNbnxmKernel(doPruneNBL, + stepWork.computeEnergy, + nbp->elecType, + nbp->vdwType, + deviceStream, + plist->nsci, + adat->xq, + fAsFloat, + adat->shiftVec, + fShiftAsFloat, + adat->eElec, + adat->eLJ, + plist->cj4, + plist->sci, + plist->excl, + adat->ljComb, + adat->atomTypes, + nbp->nbfp, + nbp->nbfp_comb, + nbp->coulomb_tab, + adat->numTypes, + nbp->rcoulomb_sq, + nbp->rvdw_sq, + nbp->two_k_rf, + nbp->ewald_beta, + nbp->rlistOuter_sq, + nbp->sh_ewald, + nbp->epsfac, + nbp->ewaldcoeff_lj, + nbp->c_rf, + nbp->dispersion_shift, + nbp->repulsion_shift, + nbp->vdw_switch, + nbp->rvdw_switch, + nbp->sh_lj_ewald, + nbp->coulomb_tab_scale, + stepWork.computeVirial); +} + +} // namespace Nbnxm diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.h b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.h new file mode 100644 index 0000000000..dd7f52138d --- /dev/null +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.h @@ -0,0 +1,60 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2020,2021, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * \brief + * Declares nbnxn sycl helper functions + * + * \ingroup module_nbnxm + */ +#ifndef GMX_NBNXM_SYCL_NBNXM_SYCL_KERNEL_H +#define GMX_NBNXM_SYCL_NBNXM_SYCL_KERNEL_H + +// Forward declarations +namespace gmx +{ +enum class InteractionLocality; +class StepWorkload; +} // namespace gmx +struct NbnxmGpu; + +namespace Nbnxm +{ +using gmx::InteractionLocality; + +void launchNbnxmKernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const InteractionLocality iloc); + +} // namespace Nbnxm + +#endif // GMX_NBNXM_SYCL_NBNXM_SYCL_KERNEL_H diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp new file mode 100644 index 0000000000..cb88eb31d6 --- /dev/null +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp @@ -0,0 +1,287 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2020,2021, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/*! \internal \file + * \brief + * NBNXM SYCL kernels + * + * \ingroup module_nbnxm + */ +#include "gmxpre.h" + +#include "nbnxm_sycl_kernel_pruneonly.h" + +#include "gromacs/gpu_utils/devicebuffer.h" +#include "gromacs/gpu_utils/gmxsycl.h" +#include "gromacs/utility/template_mp.h" + +#include "nbnxm_sycl_kernel_utils.h" +#include "nbnxm_sycl_types.h" + +using cl::sycl::access::fence_space; +using cl::sycl::access::mode; +using cl::sycl::access::target; + +namespace Nbnxm +{ + +/*! \brief Prune-only kernel for NBNXM. + * + */ +template +auto nbnxmKernelPruneOnly(cl::sycl::handler& cgh, + DeviceAccessor a_xq, + DeviceAccessor a_shiftVec, + DeviceAccessor a_plistCJ4, + DeviceAccessor a_plistSci, + DeviceAccessor a_plistIMask, + const float rlistOuterSq, + const float rlistInnerSq, + const int numParts, + const int part) +{ + cgh.require(a_xq); + cgh.require(a_shiftVec); + cgh.require(a_plistCJ4); + cgh.require(a_plistSci); + cgh.require(a_plistIMask); + + /* shmem buffer for i x+q pre-loading */ + cl::sycl::accessor sm_xq( + cl::sycl::range<2>(c_nbnxnGpuNumClusterPerSupercluster, c_clSize), cgh); + + constexpr int warpSize = c_clSize * c_clSize / 2; + + /* Somewhat weird behavior inherited from OpenCL. + * With clSize == 4, we use sub_group size of 16 (not enforced in OpenCL implementation, but chosen + * by the IGC compiler), however for data layout we consider it to be 8. + * Setting sub_group size to 8 slows down the prune-only kernel 1.5-2 times. + * For clSize == But we need to set specific sub_group size >= 32 for clSize == 8 for correctness, + * but it causes very poor performance. + */ + constexpr int requiredSubGroupSize = (c_clSize == 4) ? 16 : warpSize; + + /* Requirements: + * Work group (block) must have range (c_clSize, c_clSize, ...) (for localId calculation, easy + * to change). */ + return [=](cl::sycl::nd_item<1> itemIdx) [[intel::reqd_sub_group_size(requiredSubGroupSize)]] + { + const cl::sycl::id<3> localId = unflattenId(itemIdx.get_local_id()); + // thread/block/warp id-s + const unsigned tidxi = localId[0]; + const unsigned tidxj = localId[1]; + const int tidx = tidxj * c_clSize + tidxi; + const unsigned tidxz = localId[2]; + const unsigned bidx = itemIdx.get_group(0); + + const sycl_2020::sub_group sg = itemIdx.get_sub_group(); + const unsigned widx = tidx / warpSize; + + // my i super-cluster's index = sciOffset + current bidx * numParts + part + const nbnxn_sci_t nbSci = a_plistSci[bidx * numParts + part]; + const int sci = nbSci.sci; /* super-cluster */ + const int cij4Start = nbSci.cj4_ind_start; /* first ...*/ + const int cij4End = nbSci.cj4_ind_end; /* and last index of j clusters */ + + if (tidxz == 0) + { + for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i += c_clSize) + { + /* Pre-load i-atom x and q into shared memory */ + const int ci = sci * c_nbnxnGpuNumClusterPerSupercluster + tidxj + i; + const int ai = ci * c_clSize + tidxi; + + /* We don't need q, but using float4 in shmem avoids bank conflicts. + (but it also wastes L2 bandwidth). */ + const float4 xq = a_xq[ai]; + const float3 shift = a_shiftVec[nbSci.shift]; + const float4 xi(xq[0] + shift[0], xq[1] + shift[1], xq[2] + shift[2], xq[3]); + sm_xq[tidxj + i][tidxi] = xi; + } + } + itemIdx.barrier(fence_space::local_space); + + /* loop over the j clusters = seen by any of the atoms in the current super-cluster. + * The loop stride c_syclPruneKernelJ4Concurrency ensures that consecutive warps-pairs are + * assigned consecutive j4's entries. */ + for (int j4 = cij4Start + tidxz; j4 < cij4End; j4 += c_syclPruneKernelJ4Concurrency) + { + unsigned imaskFull, imaskCheck, imaskNew; + + if constexpr (haveFreshList) + { + /* Read the mask from the list transferred from the CPU */ + imaskFull = a_plistCJ4[j4].imei[widx].imask; + /* We attempt to prune all pairs present in the original list */ + imaskCheck = imaskFull; + imaskNew = 0; + } + else + { + /* Read the mask from the "warp-pruned" by rlistOuter mask array */ + imaskFull = a_plistIMask[j4 * c_nbnxnGpuClusterpairSplit + widx]; + /* Read the old rolling pruned mask, use as a base for new */ + imaskNew = a_plistCJ4[j4].imei[widx].imask; + /* We only need to check pairs with different mask */ + imaskCheck = (imaskNew ^ imaskFull); + } + + if (imaskCheck) + { + for (int jm = 0; jm < c_nbnxnGpuJgroupSize; jm++) + { + if (imaskCheck & (superClInteractionMask << (jm * c_nbnxnGpuNumClusterPerSupercluster))) + { + unsigned mask_ji = (1U << (jm * c_nbnxnGpuNumClusterPerSupercluster)); + // SYCL-TODO: Reevaluate prefetching methods + const int cj = a_plistCJ4[j4].cj[jm]; + const int aj = cj * c_clSize + tidxj; + + /* load j atom data */ + const float4 tmp = a_xq[aj]; + const float3 xj(tmp[0], tmp[1], tmp[2]); + + for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++) + { + if (imaskCheck & mask_ji) + { + // load i-cluster coordinates from shmem + const float4 xi = sm_xq[i][tidxi]; + // distance between i and j atoms + float3 rv(xi[0], xi[1], xi[2]); + rv -= xj; + const float r2 = norm2(rv); + + /* If _none_ of the atoms pairs are in rlistOuter + * range, the bit corresponding to the current + * cluster-pair in imask gets set to 0. */ + if (haveFreshList && !(sycl_2020::group_any_of(sg, r2 < rlistOuterSq))) + { + imaskFull &= ~mask_ji; + } + /* If any atom pair is within range, set the bit + * corresponding to the current cluster-pair. */ + if (sycl_2020::group_any_of(sg, r2 < rlistInnerSq)) + { + imaskNew |= mask_ji; + } + } // (imaskCheck & mask_ji) + /* shift the mask bit by 1 */ + mask_ji += mask_ji; + } // (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++) + } // (imaskCheck & (superClInteractionMask << (jm * c_nbnxnGpuNumClusterPerSupercluster))) + } // for (int jm = 0; jm < c_nbnxnGpuJgroupSize; jm++) + + if constexpr (haveFreshList) + { + /* copy the list pruned to rlistOuter to a separate buffer */ + a_plistIMask[j4 * c_nbnxnGpuClusterpairSplit + widx] = imaskFull; + } + /* update the imask with only the pairs up to rlistInner */ + a_plistCJ4[j4].imei[widx].imask = imaskNew; + } // (imaskCheck) + } // for (int j4 = cij4_start + tidxz; j4 < cij4_end; j4 += c_syclPruneKernelJ4Concurrency) + }; +} + +// SYCL 1.2.1 requires providing a unique type for a kernel. Should not be needed for SYCL2020. +template +class NbnxmKernelPruneOnlyName; + +template +cl::sycl::event launchNbnxmKernelPruneOnly(const DeviceStream& deviceStream, + const int numSciInPart, + Args&&... args) +{ + // Should not be needed for SYCL2020. + using kernelNameType = NbnxmKernelPruneOnlyName; + + /* Kernel launch config: + * - The thread block dimensions match the size of i-clusters, j-clusters, + * and j-cluster concurrency, in x, y, and z, respectively. + * - The 1D block-grid contains as many blocks as super-clusters. + */ + const unsigned long numBlocks = numSciInPart; + const cl::sycl::range<3> blockSize{ c_clSize, c_clSize, c_syclPruneKernelJ4Concurrency }; + const cl::sycl::range<3> globalSize{ numBlocks * blockSize[0], blockSize[1], blockSize[2] }; + const cl::sycl::nd_range<3> range{ globalSize, blockSize }; + + cl::sycl::queue q = deviceStream.stream(); + + cl::sycl::event e = q.submit([&](cl::sycl::handler& cgh) { + auto kernel = nbnxmKernelPruneOnly(cgh, std::forward(args)...); + cgh.parallel_for(flattenNDRange(range), kernel); + }); + + return e; +} + +template +cl::sycl::event chooseAndLaunchNbnxmKernelPruneOnly(bool haveFreshList, Args&&... args) +{ + return gmx::dispatchTemplatedFunction( + [&](auto haveFreshList_) { + return launchNbnxmKernelPruneOnly(std::forward(args)...); + }, + haveFreshList); +} + +void launchNbnxmKernelPruneOnly(NbnxmGpu* nb, + const InteractionLocality iloc, + const int numParts, + const int part, + const int numSciInPart) +{ + sycl_atomdata_t* adat = nb->atdat; + NBParamGpu* nbp = nb->nbparam; + gpu_plist* plist = nb->plist[iloc]; + const bool haveFreshList = plist->haveFreshList; + const DeviceStream& deviceStream = *nb->deviceStreams[iloc]; + + cl::sycl::event e = chooseAndLaunchNbnxmKernelPruneOnly(haveFreshList, + deviceStream, + numSciInPart, + adat->xq, + adat->shiftVec, + plist->cj4, + plist->sci, + plist->imask, + nbp->rlistOuter_sq, + nbp->rlistInner_sq, + numParts, + part); +} + +} // namespace Nbnxm diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.h b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.h new file mode 100644 index 0000000000..6af97b1d1e --- /dev/null +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.h @@ -0,0 +1,63 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2020,2021, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * \brief + * Declares nbnxn sycl helper functions + * + * \ingroup module_nbnxm + */ +#ifndef GMX_NBNXM_SYCL_NBNXM_SYCL_KERNEL_PRUNEONLY_H +#define GMX_NBNXM_SYCL_NBNXM_SYCL_KERNEL_PRUNEONLY_H + +// Forward declarations +namespace gmx +{ +enum class InteractionLocality; +} +struct NbnxmGpu; + +namespace Nbnxm +{ +using gmx::InteractionLocality; + +void launchNbnxmKernelPruneOnly(NbnxmGpu* nb, + const InteractionLocality iloc, + const int numParts, + const int part, + const int numSciInPart); + +} // namespace Nbnxm + +#endif // GMX_NBNXM_SYCL_NBNXM_SYCL_KERNEL_PRUNEONLY_H diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h new file mode 100644 index 0000000000..c3aae4ebb5 --- /dev/null +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h @@ -0,0 +1,127 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2020,2021, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * \brief + * Helper functions and constants for SYCL NBNXM kernels + * + * \ingroup module_nbnxm + */ +#ifndef GMX_NBNXM_SYCL_NBNXN_SYCL_KERNEL_UTILS_H +#define GMX_NBNXM_SYCL_NBNXN_SYCL_KERNEL_UTILS_H + +#include "gromacs/nbnxm/pairlist.h" +#include "gromacs/nbnxm/pairlistparams.h" + +namespace Nbnxm +{ + +#ifndef GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY +# define GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY 4 +#endif +/*! \brief Macro defining default for the prune kernel's j4 processing concurrency. + * + * The GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY macro allows compile-time override. + */ +static constexpr int c_syclPruneKernelJ4Concurrency = GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY; + +/* Convenience constants */ +/*! \cond */ +// cluster size = number of atoms per cluster. +static constexpr int c_clSize = c_nbnxnGpuClusterSize; +// j-cluster size after split (4 in the current implementation). +static constexpr int c_splitClSize = c_clSize / c_nbnxnGpuClusterpairSplit; +// i-cluster interaction mask for a super-cluster with all c_nbnxnGpuNumClusterPerSupercluster=8 bits set. +static constexpr unsigned superClInteractionMask = ((1U << c_nbnxnGpuNumClusterPerSupercluster) - 1U); + +// 1/sqrt(pi), same value as \c M_FLOAT_1_SQRTPI in other NB kernels. +static constexpr float c_OneOverSqrtPi = 0.564189583547756F; +// 1/6, same value as in other NB kernels. +static constexpr float c_oneSixth = 0.16666667F; +// 1/12, same value as in other NB kernels. +static constexpr float c_oneTwelfth = 0.08333333F; +/*! \endcond */ + +/* The following functions are necessary because on some versions of Intel OpenCL RT, subgroups + * do not properly work (segfault or create subgroups of size 1) if used in kernels + * with non-1-dimensional workgroup. */ +//! \brief Convert 3D range to 1D +static inline cl::sycl::range<1> flattenRange(cl::sycl::range<3> range3d) +{ + return cl::sycl::range<1>(range3d.size()); +} + +//! \brief Convert 3D nd_range to 1D +static inline cl::sycl::nd_range<1> flattenNDRange(cl::sycl::nd_range<3> nd_range3d) +{ + return cl::sycl::nd_range<1>(flattenRange(nd_range3d.get_global_range()), + flattenRange(nd_range3d.get_local_range())); +} + +//! \brief Convert flattened 1D index to 3D +template +static inline cl::sycl::id<3> unflattenId(cl::sycl::id<1> id1d) +{ + constexpr unsigned rangeXY = rangeX * rangeY; + const unsigned id = id1d[0]; + const unsigned z = id / rangeXY; + const unsigned xy = id % rangeXY; + return cl::sycl::id<3>(xy % rangeX, xy / rangeX, z); +} + +//! \brief Convenience wrapper to do atomic addition to a global buffer +template +static inline void atomicFetchAdd(DeviceAccessor acc, const IndexType idx, const float val) +{ + if (cl::sycl::isnormal(val)) + { + sycl_2020::atomic_ref + fout_atomic(acc[idx]); + fout_atomic.fetch_add(val); + } +} + +static inline float shuffleDown(float var, unsigned int delta, sycl_2020::sub_group sg) +{ + return sg.shuffle_down(var, delta); +} + +static inline float shuffleUp(float var, unsigned int delta, sycl_2020::sub_group sg) +{ + return sg.shuffle_up(var, delta); +} + +} // namespace Nbnxm + +#endif // GMX_NBNXM_SYCL_NBNXN_SYCL_KERNEL_UTILS_H diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_types.h b/src/gromacs/nbnxm/sycl/nbnxm_sycl_types.h new file mode 100644 index 0000000000..edfd2b7473 --- /dev/null +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_types.h @@ -0,0 +1,190 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2020,2021, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/*! \internal \file + * \brief + * Data types used internally in the nbnxm_sycl module. + * + * \ingroup module_nbnxm + */ + +#ifndef NBNXM_SYCL_TYPES_H +#define NBNXM_SYCL_TYPES_H + +#include "gromacs/gpu_utils/devicebuffer.h" +#include "gromacs/gpu_utils/devicebuffer_sycl.h" +#include "gromacs/gpu_utils/gmxsycl.h" +#include "gromacs/gpu_utils/gpueventsynchronizer_sycl.h" +#include "gromacs/gpu_utils/gputraits.h" +#include "gromacs/gpu_utils/syclutils.h" +#include "gromacs/nbnxm/gpu_types_common.h" +#include "gromacs/nbnxm/nbnxm.h" +#include "gromacs/nbnxm/pairlist.h" +#include "gromacs/timing/gpu_timing.h" +#include "gromacs/utility/enumerationhelpers.h" + +/*! \internal + * \brief Staging area for temporary data downloaded from the GPU. + * + * Since SYCL buffers already have host-side storage, this is a bit redundant. + * But it allows prefetching of the data from GPU, and brings GPU backends closer together. + */ +struct nb_staging_t +{ + //! LJ energy + float* e_lj = nullptr; + //! electrostatic energy + float* e_el = nullptr; + //! shift forces + float3* fshift = nullptr; +}; + +/** \internal + * \brief Nonbonded atom data - both inputs and outputs. + */ +struct sycl_atomdata_t +{ + //! number of atoms + int natoms; + //! number of local atoms + int natoms_local; // + //! allocation size for the atom data (xq, f) + int numAlloc; + + //! atom coordinates + charges, size \ref natoms + DeviceBuffer xq; + //! force output array, size \ref natoms + DeviceBuffer f; + + //! LJ energy output, size 1 + DeviceBuffer eLJ; + //! Electrostatics energy input, size 1 + DeviceBuffer eElec; + + //! shift forces + DeviceBuffer fShift; + + //! number of atom types + int numTypes; + //! atom type indices, size \ref natoms + DeviceBuffer atomTypes; + //! sqrt(c6),sqrt(c12) size \ref natoms + DeviceBuffer ljComb; + + //! shifts + DeviceBuffer shiftVec; + //! true if the shift vector has been uploaded + bool shiftVecUploaded; +}; + +class GpuEventSynchronizer; + +/*! \internal + * \brief Main data structure for SYCL nonbonded force calculations. + */ +struct NbnxmGpu +{ + /*! \brief GPU device context. + * + * \todo Make it constant reference, once NbnxmGpu is a proper class. + */ + const DeviceContext* deviceContext_; + /*! \brief true if doing both local/non-local NB work on GPU */ + bool bUseTwoStreams = false; + /*! \brief true indicates that the nonlocal_done event was enqueued */ + bool bNonLocalStreamActive = false; + /*! \brief atom data */ + sycl_atomdata_t* atdat = nullptr; + /*! \brief f buf ops cell index mapping */ + + NBParamGpu* nbparam = nullptr; + /*! \brief pair-list data structures (local and non-local) */ + gmx::EnumerationArray plist = { { nullptr } }; + /*! \brief staging area where fshift/energies get downloaded. Will be removed in SYCL. */ + nb_staging_t nbst; + /*! \brief local and non-local GPU streams */ + gmx::EnumerationArray deviceStreams; + + /*! \brief True if event-based timing is enabled. Always false for SYCL. */ + bool bDoTime = false; + /*! \brief Dummy timers. */ + Nbnxm::gpu_timers_t* timers = nullptr; + /*! \brief Dummy timing data. */ + gmx_wallclock_gpu_nbnxn_t* timings = nullptr; + + //! true when a pair-list transfer has been done at this step + gmx::EnumerationArray didPairlistH2D = { { false } }; + //! true when we we did pruning on this step + gmx::EnumerationArray didPrune = { { false } }; + //! true when we did rolling pruning (at the previous step) + gmx::EnumerationArray didRollingPrune = { { false } }; + + /*! \brief Events used for synchronization. Would be deprecated in SYCL. */ + /*! \{ */ + /*! \brief Event triggered when the non-local non-bonded + * kernel is done (and the local transfer can proceed) */ + GpuEventSynchronizer nonlocal_done; + /*! \brief Event triggered when the tasks issued in the local + * stream that need to precede the non-local force or buffer + * operation calculations are done (e.g. f buffer 0-ing, local + * x/q H2D, buffer op initialization in local stream that is + * required also by nonlocal stream ) */ + GpuEventSynchronizer misc_ops_and_local_H2D_done; + /*! \} */ + + /*! \brief True if there is work for the current domain in the + * respective locality. + * + * This includes local/nonlocal GPU work, either bonded or + * nonbonded, scheduled to be executed in the current + * domain. As long as bonded work is not split up into + * local/nonlocal, if there is bonded GPU work, both flags + * will be true. */ + gmx::EnumerationArray haveWork = { { false } }; + + /*! \brief Pointer to event synchronizer triggered when the local + * GPU buffer ops / reduction is complete. Would be deprecated in SYCL. + * + * \note That the synchronizer is managed outside of this module + * in StatePropagatorDataGpu. + */ + GpuEventSynchronizer* localFReductionDone = nullptr; + + /*! \brief Event triggered when non-local coordinate buffer + * has been copied from device to host. Would be deprecated in SYCL. */ + GpuEventSynchronizer* xNonLocalCopyD2HDone = nullptr; +}; + +#endif /* NBNXM_SYCL_TYPES_H */ diff --git a/src/testutils/TestMacros.cmake b/src/testutils/TestMacros.cmake index 01e94e9cd6..bdd991af4b 100644 --- a/src/testutils/TestMacros.cmake +++ b/src/testutils/TestMacros.cmake @@ -214,7 +214,7 @@ function (gmx_register_gtest_test NAME EXENAME) # Both OpenCL (from JIT) and ThreadSanitizer (from how it # checks) can take signficantly more time than other # configurations. - if (GMX_GPU_OPENCL) + if (GMX_GPU_OPENCL OR GMX_GPU_SYCL) set(_timeout 240) elseif (${CMAKE_BUILD_TYPE} STREQUAL TSAN) set(_timeout 300)