# This file is part of the GROMACS molecular simulation package.
#
# Copyright (c) 2014,2015,2016,2017,2018 by the GROMACS development team.
-# Copyright (c) 2019,2020, by the GROMACS development team, led by
+# Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
# and including many others, as listed in the AUTHORS file in the
# top-level source directory and at http://www.gromacs.org.
# Must add these files so they can include device_information.h
pme_gpu_internal.cpp
pme_gpu_timings.cpp
- )
+ )
elseif (GMX_GPU_OPENCL)
gmx_add_libgromacs_sources(
# OpenCL-specific sources
elseif (GMX_GPU_SYCL)
# SYCL-TODO: proper implementation
gmx_add_libgromacs_sources(
- pme_gpu_program_impl.cpp
+ # Files that implement stubs
+ pme_gpu_sycl_stubs.cpp
+ # GPU-specific sources
+ pme_gpu.cpp
+ pme_gpu_internal.cpp
+ pme_gpu_timings.cpp
)
_gmx_add_files_to_property(SYCL_SOURCES
- pme_gpu_program_impl.cpp
+ pme_gpu_internal.cpp
pme_gpu_program.cpp
+ pme_gpu_sycl_stubs.cpp
+ pme_gpu_timings.cpp
)
else()
gmx_add_libgromacs_sources(
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team.
+ * Copyright (c) 2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
# include "gromacs/gpu_utils/gmxopencl.h"
# include "gromacs/gpu_utils/gputraits_ocl.h"
+#elif GMX_GPU_SYCL
+# include "gromacs/gpu_utils/gputraits_sycl.h"
#endif
#include "gromacs/fft/fft.h" // for the enum gmx_fft_direction
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team.
+ * Copyright (c) 2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
# include "pme.cuh"
#elif GMX_GPU_OPENCL
# include "gromacs/gpu_utils/gmxopencl.h"
+#elif GMX_GPU_SYCL
+# include "gromacs/gpu_utils/syclutils.h"
#endif
#include "gromacs/ewald/pme.h"
kernelParamsPtr->fractShiftsTableTexture);
destroyParamLookupTable(&kernelParamsPtr->grid.d_gridlineIndicesTable,
kernelParamsPtr->gridlineIndicesTableTexture);
-#elif GMX_GPU_OPENCL
+#elif GMX_GPU_OPENCL || GMX_GPU_SYCL
freeDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable);
freeDeviceBuffer(&kernelParamsPtr->grid.d_gridlineIndicesTable);
#endif
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief Implements stubs of high-level PME GPU functions for SYCL.
+ *
+ * \author Andrey Alekseenko <al42and@gmail.com>
+ *
+ * \ingroup module_ewald
+ */
+#include "gmxpre.h"
+
+#include "gromacs/ewald/ewald_utils.h"
+
+#include "pme_gpu_3dfft.h"
+#include "pme_gpu_internal.h"
+#include "pme_gpu_program_impl.h"
+
+PmeGpuProgramImpl::PmeGpuProgramImpl(const DeviceContext& deviceContext) :
+ deviceContext_(deviceContext),
+ warpSize_(0),
+ spreadWorkGroupSize(0),
+ gatherWorkGroupSize(0),
+ solveMaxWorkGroupSize(0)
+{
+ // SYCL-TODO
+}
+
+PmeGpuProgramImpl::~PmeGpuProgramImpl() = default;
+
+// [[noreturn]] attributes must be added in the common headers, so it's easier to silence the warning here
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wmissing-noreturn"
+
+GpuParallel3dFft::GpuParallel3dFft(PmeGpu const* /*pmeGpu*/, int /*gridIndex*/)
+{
+ GMX_THROW(gmx::NotImplementedError("PME is not implemented in SYCL"));
+}
+
+GpuParallel3dFft::~GpuParallel3dFft() = default;
+
+void GpuParallel3dFft::perform3dFft(gmx_fft_direction /*dir*/, CommandEvent* /*timingEvent*/)
+{
+ GMX_THROW(gmx::NotImplementedError("Not implemented on SYCL yet"));
+}
+
+#pragma clang diagnostic pop
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
struct PmeGpuCudaKernelParams;
/*! \brief A typedef for including the GPU kernel arguments data by pointer */
typedef PmeGpuCudaKernelParams PmeGpuKernelParams;
-#elif GMX_GPU_OPENCL
+#elif GMX_GPU_OPENCL || GMX_GPU_SYCL
struct PmeGpuKernelParamsBase;
/*! \brief A typedef for including the GPU kernel arguments data by pointer */
typedef PmeGpuKernelParamsBase PmeGpuKernelParams;
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#elif GMX_GPU_OPENCL
# include "gromacs/gpu_utils/gpueventsynchronizer_ocl.h"
# include "gromacs/gpu_utils/gpuregiontimer_ocl.h"
+#elif GMX_GPU_SYCL
+# include "gromacs/gpu_utils/gpueventsynchronizer_sycl.h"
+# include "gromacs/gpu_utils/gpuregiontimer_sycl.h"
#endif
#include "gromacs/timing/gpu_timing.h" // for gtPME_EVENT_COUNT
#
# This file is part of the GROMACS molecular simulation package.
#
-# Copyright (c) 2015,2016,2017,2018,2019,2020, by the GROMACS development team, led by
+# Copyright (c) 2015,2016,2017,2018,2019, by the GROMACS development team.
+# Copyright (c) 2020,2021, by the GROMACS development team, led by
# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
# and including many others, as listed in the AUTHORS file in the
# top-level source directory and at http://www.gromacs.org.
)
_gmx_add_files_to_property(CUDA_SOURCES
device_stream_manager.cpp
- )
+ )
elseif(GMX_GPU_SYCL)
gmx_add_libgromacs_sources(
devicebuffer_sycl.cpp
device_context_sycl.cpp
device_stream_sycl.cpp
+ syclutils.cpp
)
_gmx_add_files_to_property(SYCL_SOURCES
devicebuffer_sycl.cpp
device_context_sycl.cpp
device_stream_manager.cpp
device_stream_sycl.cpp
+ syclutils.cpp
)
else()
gmx_add_libgromacs_sources(
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2014,2015,2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2014,2015,2017,2018,2019, by the GROMACS development team.
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
# define OPENCL_FUNC_ARGUMENT REAL_FUNC_ARGUMENT
# define OPENCL_FUNC_TERM REAL_FUNC_TERM
# define OPENCL_FUNC_TERM_WITH_RETURN(arg) REAL_FUNC_TERM_WITH_RETURN(arg)
+# define SYCL_FUNC_QUALIFIER REAL_FUNC_QUALIFIER
+# define SYCL_FUNC_ARGUMENT REAL_FUNC_ARGUMENT
+# define SYCL_FUNC_TERM REAL_FUNC_TERM
+# define SYCL_FUNC_TERM_WITH_RETURN(arg) REAL_FUNC_TERM_WITH_RETURN(arg)
#else // Not DOXYGEN
/* GPU support is enabled, so these functions will have real code defined somewhere */
-# if GMX_GPU && !GMX_GPU_SYCL
+# if GMX_GPU
# define GPU_FUNC_QUALIFIER REAL_FUNC_QUALIFIER
# define GPU_FUNC_ARGUMENT REAL_FUNC_ARGUMENT
# define GPU_FUNC_TERM REAL_FUNC_TERM
# define GPU_FUNC_ARGUMENT NULL_FUNC_ARGUMENT
# define GPU_FUNC_TERM NULL_FUNC_TERM
# define GPU_FUNC_TERM_WITH_RETURN(arg) NULL_FUNC_TERM_WITH_RETURN(arg)
-#
# endif
/* Enable and disable platform-specific function implementations */
# define CUDA_FUNC_TERM_WITH_RETURN(arg) NULL_FUNC_TERM_WITH_RETURN(arg)
# endif
+# if GMX_GPU_SYCL
+# define SYCL_FUNC_QUALIFIER REAL_FUNC_QUALIFIER
+# define SYCL_FUNC_ARGUMENT REAL_FUNC_ARGUMENT
+# define SYCL_FUNC_TERM REAL_FUNC_TERM
+# define SYCL_FUNC_TERM_WITH_RETURN(arg) REAL_FUNC_TERM_WITH_RETURN(arg)
+# else
+# define SYCL_FUNC_QUALIFIER NULL_FUNC_QUALIFIER
+# define SYCL_FUNC_ARGUMENT NULL_FUNC_ARGUMENT
+# define SYCL_FUNC_TERM NULL_FUNC_TERM
+# define SYCL_FUNC_TERM_WITH_RETURN(arg) NULL_FUNC_TERM_WITH_RETURN(arg)
+# endif
+
#endif // ifdef DOXYGEN
#endif // GMX_GPU_UTILS_MACROS_H
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2014,2015,2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2014,2015,2017,2018,2019, by the GROMACS development team.
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#include "config.h"
-#include <cassert>
-
#include "gromacs/utility/arrayref.h"
-#include "gromacs/utility/smalloc.h"
#include "gromacs/utility/stringutil.h"
#ifdef _MSC_VER
{
errorReasons.emplace_back("non-GPU build of GROMACS");
}
- if (GMX_GPU_SYCL)
- {
- errorReasons.emplace_back("SYCL build of GROMACS");
- }
return addMessageIfNotSupported(errorReasons, error);
}
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#include <cstddef>
#include "gromacs/gpu_utils/gmxsycl.h"
+#include "gromacs/math/vectypes.h"
using DeviceTexture = void*;
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ * \brief Define utility routines for SYCL
+ *
+ * \author Andrey Alekseenko <al42and@gmail.com>
+ */
+#include "gmxpre.h"
+
+#include "syclutils.h"
+
+#include "gromacs/utility/smalloc.h"
+
+/*! \brief Allocates \p nbytes of host memory. Use \c pfree to free memory allocated with this function.
+ *
+ * \todo
+ * This function was copied from OpenCL implementation, not tuned for SYCL at all.
+ * Once SYCL2020 is out, might be worthwhile to look into USM and sycl::malloc_host / sycl::aligned_alloc_host.
+ * Overall, it is better to directly use sycl::buffer instead of pinned arrays. But this function
+ * is needed to compile some PME code with SYCL enabled, even if it is never used.
+ *
+ * \param[in,out] h_ptr Pointer where to store the address of the newly allocated buffer.
+ * \param[in] nbytes Size in bytes of the buffer to be allocated.
+ */
+void pmalloc(void** h_ptr, size_t nbytes)
+{
+ /* Need a temporary type whose size is 1 byte, so that the
+ * implementation of snew_aligned can cope without issuing
+ * warnings. */
+ auto** temporary = reinterpret_cast<std::byte**>(h_ptr);
+
+ /* 16-byte alignment inherited from OpenCL and does not sound unreasonable */
+ snew_aligned(*temporary, nbytes, 16);
+}
+
+/*! \brief Frees memory allocated with pmalloc.
+ *
+ * \param[in] h_ptr Buffer allocated with pmalloc that needs to be freed.
+ */
+void pfree(void* h_ptr)
+{
+ if (h_ptr)
+ {
+ sfree_aligned(h_ptr);
+ }
+}
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \libinternal \file
+ * \brief Declare utility routines for SYCL
+ *
+ * \author Andrey Alekseenko <al42and@gmail.com>
+ * \inlibraryapi
+ */
+#ifndef GMX_GPU_UTILS_SYCLUTILS_H
+#define GMX_GPU_UTILS_SYCLUTILS_H
+
+#include <string>
+
+#include "gromacs/gpu_utils/gmxsycl.h"
+#include "gromacs/gpu_utils/gputraits.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/gmxassert.h"
+#include "gromacs/utility/stringutil.h"
+
+class DeviceStream;
+enum class GpuApiCallBehavior;
+
+/*! \internal
+ * \brief SYCL GPU runtime data
+ *
+ * The device runtime data is meant to hold objects associated with a GROMACS rank's
+ * (thread or process) use of a single device (multiple devices per rank is not
+ * implemented). These objects should be constructed at the point where a device
+ * gets assigned to a rank and released at when this assignment is no longer valid
+ * (i.e. at cleanup in the current implementation).
+ */
+struct gmx_device_runtime_data_t
+{
+};
+
+#ifndef DOXYGEN
+
+/*! \brief Allocate host memory in malloc style */
+void pmalloc(void** h_ptr, size_t nbytes);
+
+/*! \brief Free host memory in malloc style */
+void pfree(void* h_ptr);
+
+/* To properly mark function as [[noreturn]], we must do it everywhere it is declared, which
+ * will pollute common headers.*/
+# pragma clang diagnostic push
+# pragma clang diagnostic ignored "-Wmissing-noreturn"
+
+/*! \brief
+ * A wrapper function for setting up all the SYCL kernel arguments.
+ * Calls the recursive functions above.
+ *
+ * \tparam Args Types of all the kernel arguments
+ * \param[in] kernel Kernel function handle
+ * \param[in] config Kernel configuration for launching
+ * \param[in] argsPtrs Pointers to all the kernel arguments
+ * \returns A handle for the prepared parameter pack to be used with launchGpuKernel() as the last argument.
+ */
+template<typename... Args>
+void* prepareGpuKernelArguments(void* /*kernel*/, const KernelLaunchConfig& /*config*/, const Args*... /*argsPtrs*/)
+{
+ GMX_THROW(gmx::NotImplementedError("Not implemented on SYCL yet"));
+}
+
+/*! \brief Launches the SYCL kernel and handles the errors.
+ *
+ * \param[in] kernel Kernel function handle
+ * \param[in] config Kernel configuration for launching
+ * \param[in] deviceStream GPU stream to launch kernel in
+ * \param[in] timingEvent Timing event, fetched from GpuRegionTimer
+ * \param[in] kernelName Human readable kernel description, for error handling only
+ * \throws gmx::InternalError on kernel launch failure
+ */
+inline void launchGpuKernel(void* /*kernel*/,
+ const KernelLaunchConfig& /*config*/,
+ const DeviceStream& /*deviceStream*/,
+ CommandEvent* /*timingEvent*/,
+ const char* /*kernelName*/,
+ const void* /*kernelArgs*/)
+{
+ GMX_THROW(gmx::NotImplementedError("Not implemented on SYCL yet"));
+}
+
+/*! \brief Pretend to check a SYCL stream for unfinished work (dummy implementation).
+ *
+ * \returns Not implemented in SYCL.
+ */
+static inline bool haveStreamTasksCompleted(const DeviceStream& /* deviceStream */)
+{
+ GMX_THROW(gmx::NotImplementedError("Not implemented on SYCL yet"));
+}
+
+# pragma clang diagnostic pop
+
+#endif // !DOXYGEN
+
+#endif
return DeviceStatus::Compatible;
}
- if (syclDevice.is_accelerator()) // FPGAs and FPGA emulators
+ if (syclDevice.get_info<cl::sycl::info::device::local_mem_type>() == cl::sycl::info::local_mem_type::none)
{
+ // While some kernels (leapfrog) can run without shared/local memory, this is a bad sign
return DeviceStatus::Incompatible;
}
- else
+
+ if (syclDevice.is_host())
+ {
+ // Host device does not support subgroups or even querying for sub_group_sizes
+ return DeviceStatus::Incompatible;
+ }
+ const std::vector<size_t> supportedSubGroupSizes =
+ syclDevice.get_info<cl::sycl::info::device::sub_group_sizes>();
+ const size_t requiredSubGroupSizeForNBNXM = 8;
+ if (std::find(supportedSubGroupSizes.begin(), supportedSubGroupSizes.end(), requiredSubGroupSizeForNBNXM)
+ == supportedSubGroupSizes.end())
+ {
+ return DeviceStatus::IncompatibleClusterSize;
+ }
+
+ /* Host device can not be used, because NBNXM requires sub-groups, which are not supported.
+ * Accelerators (FPGAs and their emulators) are not supported.
+ * So, the only viable options are CPUs and GPUs. */
+ const bool forceCpu = (getenv("GMX_SYCL_FORCE_CPU") != nullptr);
+
+ if (forceCpu && syclDevice.is_cpu())
+ {
+ return DeviceStatus::Compatible;
+ }
+ else if (!forceCpu && syclDevice.is_gpu())
{
return DeviceStatus::Compatible;
}
+ else
+ {
+ return DeviceStatus::Incompatible;
+ }
}
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
{
errorReasons.emplace_back("not supported with OpenCL build of GROMACS");
}
+ if (GMX_GPU_SYCL)
+ {
+ errorReasons.emplace_back("not supported with SYCL build of GROMACS");
+ }
else if (!GMX_GPU)
{
errorReasons.emplace_back("not supported with CPU-only build of GROMACS");
#
# This file is part of the GROMACS molecular simulation package.
#
-# Copyright (c) 2019,2020, by the GROMACS development team, led by
+# Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
# and including many others, as listed in the AUTHORS file in the
# top-level source directory and at http://www.gromacs.org.
gmx_add_libgromacs_sources(nbnxm_gpu_data_mgmt.cpp)
_gmx_add_files_to_property(CUDA_SOURCES
nbnxm_gpu_data_mgmt.cpp
- )
+ )
endif()
if(GMX_GPU_OPENCL)
gmx_add_libgromacs_sources(nbnxm_gpu_data_mgmt.cpp)
endif()
+if(GMX_GPU_SYCL)
+ add_subdirectory(sycl)
+ gmx_add_libgromacs_sources(nbnxm_gpu_data_mgmt.cpp)
+ _gmx_add_files_to_property(SYCL_SOURCES nbnxm_gpu_data_mgmt.cpp nbnxm.cpp)
+endif()
+
set(LIBGROMACS_SOURCES ${LIBGROMACS_SOURCES} ${NBNXM_SOURCES} PARENT_SCOPE)
#target_link_libraries(nbnxm PUBLIC
target_link_libraries(nbnxm INTERFACE
utility
- )
\ No newline at end of file
+ )
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
# include "opencl/nbnxm_ocl_types.h"
#endif
+#if GMX_GPU_SYCL
+# include "sycl/nbnxm_sycl_types.h"
+#endif
+
#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/listed_forces/gpubonded.h"
#include "gromacs/math/vec.h"
* \param[out] e_el Variable to accumulate electrostatic energy into
* \param[out] fshift Pointer to the array of shift forces to accumulate into
*/
-template<typename StagingData>
-static inline void gpu_reduce_staged_outputs(const StagingData& nbst,
+static inline void gpu_reduce_staged_outputs(const nb_staging_t& nbst,
const InteractionLocality iLocality,
const bool reduceEnergies,
const bool reduceFshift,
}
}
- /* Always reset both pruning flags (doesn't hurt doing it even when timing is off). */
- nb->timers->interaction[iLocality].didPrune =
- nb->timers->interaction[iLocality].didRollingPrune = false;
+ /* Reset both pruning flags. */
+ if (nb->bDoTime)
+ {
+ nb->timers->interaction[iLocality].didPrune =
+ nb->timers->interaction[iLocality].didRollingPrune = false;
+ }
/* Turn off initial list pruning (doesn't hurt if this is not pair-search step). */
nb->plist[iLocality]->haveFreshList = false;
#include "config.h"
+#include "gromacs/mdtypes/interaction_const.h"
#include "gromacs/mdtypes/locality.h"
#include "gromacs/utility/enumerationhelpers.h"
# include "gromacs/gpu_utils/gpuregiontimer.cuh"
#endif
+#if GMX_GPU_SYCL
+# include "gromacs/gpu_utils/gpuregiontimer_sycl.h"
+#endif
+
/** \internal
* \brief Parameters required for the GPU nonbonded calculations.
*/
float two_k_rf;
//! Ewald/PME parameter
float ewald_beta;
- //! Ewald/PME correction term substracted from the direct-space potential
+ //! Ewald/PME correction term subtracted from the direct-space potential
float sh_ewald;
//! LJ-Ewald/PME correction term added to the correction potential
float sh_lj_ewald;
# include "opencl/nbnxm_ocl_types.h"
#endif
+#if GMX_GPU_SYCL
+# include "sycl/nbnxm_sycl_types.h"
+#endif
+
#include "nbnxm_gpu_data_mgmt.h"
#include "gromacs/hardware/device_information.h"
#include "gromacs/nbnxm/gpu_data_mgmt.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/fatalerror.h"
#include "nbnxm_gpu.h"
#include "pairlistsets.h"
//! The i- and j-cluster size for GPU lists, 8 atoms for CUDA, set at compile time for OpenCL
#if GMX_GPU_OPENCL
static constexpr int c_nbnxnGpuClusterSize = GMX_OPENCL_NB_CLUSTER_SIZE;
+#elif GMX_GPU_SYCL
+static constexpr int c_nbnxnGpuClusterSize = 4;
#else
static constexpr int c_nbnxnGpuClusterSize = 8;
#endif
--- /dev/null
+#
+# This file is part of the GROMACS molecular simulation package.
+#
+# Copyright (c) 2020,2021, by the GROMACS development team, led by
+# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+# and including many others, as listed in the AUTHORS file in the
+# top-level source directory and at http://www.gromacs.org.
+#
+# GROMACS is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 2.1
+# of the License, or (at your option) any later version.
+#
+# GROMACS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with GROMACS; if not, see
+# http://www.gnu.org/licenses, or write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+# If you want to redistribute modifications to GROMACS, please
+# consider that scientific software is very special. Version
+# control is crucial - bugs must be traceable. We will be happy to
+# consider code for inclusion in the official distribution, but
+# derived work must not be called official GROMACS. Details are found
+# in the README & COPYING files - if they are missing, get the
+# official version at http://www.gromacs.org.
+#
+# To help us fund GROMACS development, we humbly ask that you cite
+# the research papers on the package. Check out http://www.gromacs.org.
+
+if (GMX_GPU_SYCL)
+ file(GLOB NBNXM_SYCL_SOURCES *.cpp)
+ set(NBNXM_SOURCES ${NBNXM_SOURCES} ${NBNXM_SYCL_SOURCES} PARENT_SCOPE)
+ _gmx_add_files_to_property(SYCL_SOURCES ${NBNXM_SYCL_SOURCES})
+endif()
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief
+ * Data management and kernel launch functions for nbnxm sycl.
+ *
+ * \ingroup module_nbnxm
+ */
+#include "gmxpre.h"
+
+#include "gromacs/nbnxm/gpu_common.h"
+#include "gromacs/utility/exceptions.h"
+
+#include "nbnxm_sycl_kernel.h"
+#include "nbnxm_sycl_kernel_pruneonly.h"
+#include "nbnxm_sycl_types.h"
+
+namespace Nbnxm
+{
+
+/*! \brief
+ * Launch asynchronously the download of nonbonded forces from the GPU
+ * (and energies/shift forces if required).
+ */
+void gpu_launch_cpyback(NbnxmGpu* nb,
+ struct nbnxn_atomdata_t* nbatom,
+ const gmx::StepWorkload& stepWork,
+ const AtomLocality atomLocality)
+{
+ GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+
+ const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
+ const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
+ sycl_atomdata_t* adat = nb->atdat;
+
+ /* don't launch non-local copy-back if there was no non-local work to do */
+ if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
+ {
+ nb->bNonLocalStreamActive = false;
+ return;
+ }
+
+ int adatBegin, adatLen;
+ getGpuAtomRange(adat, atomLocality, &adatBegin, &adatLen);
+
+ // With DD the local D2H transfer can only start after the non-local kernel has finished.
+ if (iloc == InteractionLocality::Local && nb->bNonLocalStreamActive)
+ {
+ nb->nonlocal_done.waitForEvent();
+ }
+
+ /* DtoH f
+ * Skip if buffer ops / reduction is offloaded to the GPU.
+ */
+ if (!stepWork.useGpuFBufferOps)
+ {
+ GMX_ASSERT(adat->f.elementSize() == sizeof(float3),
+ "The size of the force buffer element should be equal to the size of float3.");
+ copyFromDeviceBuffer(reinterpret_cast<float3*>(nbatom->out[0].f.data()) + adatBegin,
+ &adat->f,
+ adatBegin,
+ adatLen,
+ deviceStream,
+ GpuApiCallBehavior::Async,
+ nullptr);
+ }
+
+ /* After the non-local D2H is launched the nonlocal_done event can be
+ recorded which signals that the local D2H can proceed. This event is not
+ placed after the non-local kernel because we want the non-local data
+ back first. */
+ if (iloc == InteractionLocality::NonLocal)
+ {
+ nb->nonlocal_done.markEvent(deviceStream);
+ nb->bNonLocalStreamActive = true;
+ }
+
+ /* only transfer energies in the local stream */
+ if (iloc == InteractionLocality::Local)
+ {
+ /* DtoH fshift when virial is needed */
+ if (stepWork.computeVirial)
+ {
+ GMX_ASSERT(sizeof(*nb->nbst.fshift) == adat->fShift.elementSize(),
+ "Sizes of host- and device-side shift vector elements should be the same.");
+ copyFromDeviceBuffer(
+ nb->nbst.fshift, &adat->fShift, 0, SHIFTS, deviceStream, GpuApiCallBehavior::Async, nullptr);
+ }
+
+ /* DtoH energies */
+ if (stepWork.computeEnergy)
+ {
+ GMX_ASSERT(sizeof(*nb->nbst.e_lj) == sizeof(float),
+ "Sizes of host- and device-side LJ energy terms should be the same.");
+ copyFromDeviceBuffer(
+ nb->nbst.e_lj, &adat->eLJ, 0, 1, deviceStream, GpuApiCallBehavior::Async, nullptr);
+ GMX_ASSERT(sizeof(*nb->nbst.e_el) == sizeof(float),
+ "Sizes of host- and device-side electrostatic energy terms should be the "
+ "same.");
+ copyFromDeviceBuffer(
+ nb->nbst.e_el, &adat->eElec, 0, 1, deviceStream, GpuApiCallBehavior::Async, nullptr);
+ }
+ }
+}
+
+/*! \brief Launch asynchronously the xq buffer host to device copy. */
+void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
+{
+ GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+ validateGpuAtomLocality(atomLocality);
+
+ const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
+
+ sycl_atomdata_t* adat = nb->atdat;
+ gpu_plist* plist = nb->plist[iloc];
+ const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
+
+ /* Don't launch the non-local H2D copy if there is no dependent
+ work to do: neither non-local nor other (e.g. bonded) work
+ to do that has as input the nbnxn coordinates.
+ Doing the same for the local kernel is more complicated, since the
+ local part of the force array also depends on the non-local kernel.
+ So to avoid complicating the code and to reduce the risk of bugs,
+ we always call the local local x+q copy (and the rest of the local
+ work in nbnxn_gpu_launch_kernel().
+ */
+ if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
+ {
+ plist->haveFreshList = false;
+ return;
+ }
+
+ int adatBegin, adatLen;
+ getGpuAtomRange(adat, atomLocality, &adatBegin, &adatLen);
+
+ /* HtoD x, q */
+ GMX_ASSERT(adat->xq.elementSize() == sizeof(float4),
+ "The size of the xyzq buffer element should be equal to the size of float4.");
+ copyToDeviceBuffer(&adat->xq,
+ reinterpret_cast<const float4*>(nbatom->x().data()) + adatBegin,
+ adatBegin,
+ adatLen,
+ deviceStream,
+ GpuApiCallBehavior::Async,
+ nullptr);
+
+ /* No need to enforce stream synchronization with events like we do in CUDA/OpenCL.
+ * Runtime should do the scheduling correctly based on data dependencies. */
+}
+
+void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, const int numParts)
+{
+ gpu_plist* plist = nb->plist[iloc];
+
+ if (plist->haveFreshList)
+ {
+ GMX_ASSERT(numParts == 1, "With first pruning we expect 1 part");
+
+ /* Set rollingPruningNumParts to signal that it is not set */
+ plist->rollingPruningNumParts = 0;
+ plist->rollingPruningPart = 0;
+ }
+ else
+ {
+ if (plist->rollingPruningNumParts == 0)
+ {
+ plist->rollingPruningNumParts = numParts;
+ }
+ else
+ {
+ GMX_ASSERT(numParts == plist->rollingPruningNumParts,
+ "It is not allowed to change numParts in between list generation steps");
+ }
+ }
+
+ /* Use a local variable for part and update in plist, so we can return here
+ * without duplicating the part increment code.
+ */
+ const int part = plist->rollingPruningPart;
+
+ plist->rollingPruningPart++;
+ if (plist->rollingPruningPart >= plist->rollingPruningNumParts)
+ {
+ plist->rollingPruningPart = 0;
+ }
+
+ /* Compute the number of list entries to prune in this pass */
+ const int numSciInPart = (plist->nsci - part) / numParts;
+
+ /* Don't launch the kernel if there is no work to do */
+ if (numSciInPart <= 0)
+ {
+ plist->haveFreshList = false;
+ return;
+ }
+
+ launchNbnxmKernelPruneOnly(nb, iloc, numParts, part, numSciInPart);
+
+ if (plist->haveFreshList)
+ {
+ plist->haveFreshList = false;
+ nb->didPrune[iloc] = true; // Mark that pruning has been done
+ }
+ else
+ {
+ nb->didRollingPrune[iloc] = true; // Mark that rolling pruning has been done
+ }
+}
+
+void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nbnxm::InteractionLocality iloc)
+{
+ const NBParamGpu* nbp = nb->nbparam;
+ gpu_plist* plist = nb->plist[iloc];
+
+ if (canSkipNonbondedWork(*nb, iloc))
+ {
+ plist->haveFreshList = false;
+ return;
+ }
+
+ if (nbp->useDynamicPruning && plist->haveFreshList)
+ {
+ // Prunes for rlistOuter and rlistInner, sets plist->haveFreshList=false
+ gpu_launch_kernel_pruneonly(nb, iloc, 1);
+ }
+
+ if (plist->nsci == 0)
+ {
+ /* Don't launch an empty local kernel */
+ return;
+ }
+
+ launchNbnxmKernel(nb, stepWork, iloc);
+}
+
+} // namespace Nbnxm
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief
+ * Stubs of functions that must be defined by nbnxm sycl implementation.
+ *
+ * \ingroup module_nbnxm
+ */
+#include "gmxpre.h"
+
+#include "gromacs/gpu_utils/device_stream_manager.h"
+#include "gromacs/hardware/device_information.h"
+#include "gromacs/mdtypes/interaction_const.h"
+#include "gromacs/nbnxm/atomdata.h"
+#include "gromacs/nbnxm/gpu_data_mgmt.h"
+#include "gromacs/nbnxm/nbnxm_gpu.h"
+#include "gromacs/nbnxm/nbnxm_gpu_data_mgmt.h"
+#include "gromacs/pbcutil/ishift.h"
+#include "gromacs/utility/exceptions.h"
+
+#include "nbnxm_sycl_types.h"
+
+namespace Nbnxm
+{
+
+//! This function is documented in the header file
+void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial)
+{
+ sycl_atomdata_t* adat = nb->atdat;
+ const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
+ // Clear forces
+ clearDeviceBufferAsync(&adat->f, 0, nb->atdat->natoms, localStream);
+ // Clear shift force array and energies if the outputs were used in the current step
+ if (computeVirial)
+ {
+ clearDeviceBufferAsync(&adat->fShift, 0, SHIFTS, localStream);
+ clearDeviceBufferAsync(&adat->eLJ, 0, 1, localStream);
+ clearDeviceBufferAsync(&adat->eElec, 0, 1, localStream);
+ }
+}
+
+/*! \brief Initialize \p atomdata first time; it only gets filled at pair-search. */
+static void initAtomdataFirst(NbnxmGpu* nb, int numTypes, const DeviceContext& deviceContext)
+{
+ const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
+ sycl_atomdata_t* atomdata = nb->atdat;
+ atomdata->numTypes = numTypes;
+ allocateDeviceBuffer(&atomdata->shiftVec, SHIFTS, deviceContext);
+ atomdata->shiftVecUploaded = false;
+
+ allocateDeviceBuffer(&atomdata->fShift, SHIFTS, deviceContext);
+ allocateDeviceBuffer(&atomdata->eLJ, 1, deviceContext);
+ allocateDeviceBuffer(&atomdata->eElec, 1, deviceContext);
+
+ clearDeviceBufferAsync(&atomdata->fShift, 0, SHIFTS, localStream);
+ clearDeviceBufferAsync(&atomdata->eElec, 0, 1, localStream);
+ clearDeviceBufferAsync(&atomdata->eLJ, 0, 1, localStream);
+
+ /* initialize to nullptr pointers to data that is not allocated here and will
+ need reallocation in later */
+ atomdata->xq = nullptr;
+ atomdata->f = nullptr;
+
+ /* size -1 indicates that the respective array hasn't been initialized yet */
+ atomdata->natoms = -1;
+ atomdata->numAlloc = -1;
+}
+
+/*! \brief Initialize the nonbonded parameter data structure. */
+static void initNbparam(NBParamGpu* nbp,
+ const interaction_const_t& ic,
+ const PairlistParams& listParams,
+ const nbnxn_atomdata_t::Params& nbatParams,
+ const DeviceContext& deviceContext)
+{
+ const int numTypes = nbatParams.numTypes;
+
+ set_cutoff_parameters(nbp, &ic, listParams);
+
+ nbp->vdwType = nbnxmGpuPickVdwKernelType(&ic, nbatParams.ljCombinationRule);
+ nbp->elecType = nbnxmGpuPickElectrostaticsKernelType(&ic, deviceContext.deviceInfo());
+
+ /* generate table for PME */
+ nbp->coulomb_tab = nullptr;
+ if (nbp->elecType == ElecType::EwaldTab || nbp->elecType == ElecType::EwaldTabTwin)
+ {
+ GMX_RELEASE_ASSERT(ic.coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
+ init_ewald_coulomb_force_table(*ic.coulombEwaldTables, nbp, deviceContext);
+ }
+
+ /* set up LJ parameter lookup table */
+ if (!useLjCombRule(nbp->vdwType))
+ {
+ initParamLookupTable(
+ &nbp->nbfp, &nbp->nbfp_texobj, nbatParams.nbfp.data(), 2 * numTypes * numTypes, deviceContext);
+ }
+
+ /* set up LJ-PME parameter lookup table */
+ if (ic.vdwtype == evdwPME)
+ {
+ initParamLookupTable(
+ &nbp->nbfp_comb, &nbp->nbfp_comb_texobj, nbatParams.nbfp_comb.data(), 2 * numTypes, deviceContext);
+ }
+}
+
+NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
+ const interaction_const_t* ic,
+ const PairlistParams& listParams,
+ const nbnxn_atomdata_t* nbat,
+ const bool bLocalAndNonlocal)
+{
+ auto* nb = new NbnxmGpu();
+ nb->deviceContext_ = &deviceStreamManager.context();
+ nb->atdat = new sycl_atomdata_t;
+ nb->nbparam = new NBParamGpu;
+ nb->plist[InteractionLocality::Local] = new Nbnxm::gpu_plist;
+ if (bLocalAndNonlocal)
+ {
+ nb->plist[InteractionLocality::NonLocal] = new Nbnxm::gpu_plist;
+ }
+
+ nb->bUseTwoStreams = bLocalAndNonlocal;
+
+ nb->timers = nullptr;
+ nb->timings = nullptr;
+
+ /* init nbst */
+ pmalloc(reinterpret_cast<void**>(&nb->nbst.e_lj), sizeof(*nb->nbst.e_lj));
+ pmalloc(reinterpret_cast<void**>(&nb->nbst.e_el), sizeof(*nb->nbst.e_el));
+ pmalloc(reinterpret_cast<void**>(&nb->nbst.fshift), SHIFTS * sizeof(*nb->nbst.fshift));
+
+ init_plist(nb->plist[InteractionLocality::Local]);
+
+ /* local/non-local GPU streams */
+ GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
+ "Local non-bonded stream should be initialized to use GPU for non-bonded.");
+ nb->deviceStreams[InteractionLocality::Local] =
+ &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
+ // In general, it's not strictly necessary to use 2 streams for SYCL, since they are
+ // out-of-order. But for the time being, it will be less disruptive to keep them.
+ if (nb->bUseTwoStreams)
+ {
+ init_plist(nb->plist[InteractionLocality::NonLocal]);
+
+ GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
+ "Non-local non-bonded stream should be initialized to use GPU for "
+ "non-bonded with domain decomposition.");
+ nb->deviceStreams[InteractionLocality::NonLocal] =
+ &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal);
+ }
+
+ nb->xNonLocalCopyD2HDone = new GpuEventSynchronizer();
+
+ nb->bDoTime = false;
+
+ const nbnxn_atomdata_t::Params& nbatParams = nbat->params();
+ const DeviceContext& deviceContext = *nb->deviceContext_;
+
+ initNbparam(nb->nbparam, *ic, listParams, nbatParams, deviceContext);
+ initAtomdataFirst(nb, nbatParams.numTypes, deviceContext);
+
+ return nb;
+}
+
+void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
+{
+ sycl_atomdata_t* adat = nb->atdat;
+ const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
+
+ /* only if we have a dynamic box */
+ if (nbatom->bDynamicBox || !adat->shiftVecUploaded)
+ {
+ GMX_ASSERT(adat->shiftVec.elementSize() == sizeof(nbatom->shift_vec[0]),
+ "Sizes of host- and device-side shift vectors should be the same.");
+ copyToDeviceBuffer(&adat->shiftVec,
+ reinterpret_cast<const float3*>(nbatom->shift_vec.data()),
+ 0,
+ SHIFTS,
+ localStream,
+ GpuApiCallBehavior::Async,
+ nullptr);
+ adat->shiftVecUploaded = true;
+ }
+}
+
+void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
+{
+ GMX_ASSERT(!nb->bDoTime, "Timing on SYCL not supported yet");
+ sycl_atomdata_t* atdat = nb->atdat;
+ const DeviceContext& deviceContext = *nb->deviceContext_;
+ const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
+
+ int numAtoms = nbat->numAtoms();
+ bool reallocated = false;
+ if (numAtoms > atdat->numAlloc)
+ {
+ int numAlloc = over_alloc_small(numAtoms);
+
+ /* free up first if the arrays have already been initialized */
+ if (atdat->numAlloc != -1)
+ {
+ freeDeviceBuffer(&atdat->f);
+ freeDeviceBuffer(&atdat->xq);
+ freeDeviceBuffer(&atdat->atomTypes);
+ freeDeviceBuffer(&atdat->ljComb);
+ }
+
+ allocateDeviceBuffer(&atdat->f, numAlloc, deviceContext);
+ allocateDeviceBuffer(&atdat->xq, numAlloc, deviceContext);
+ if (useLjCombRule(nb->nbparam->vdwType))
+ {
+ allocateDeviceBuffer(&atdat->ljComb, numAlloc, deviceContext);
+ }
+ else
+ {
+ allocateDeviceBuffer(&atdat->atomTypes, numAlloc, deviceContext);
+ }
+
+ atdat->numAlloc = numAlloc;
+ reallocated = true;
+ }
+
+ atdat->natoms = numAtoms;
+ atdat->natoms_local = nbat->natoms_local;
+
+ /* need to clear GPU f output if realloc happened */
+ if (reallocated)
+ {
+ clearDeviceBufferAsync(&atdat->f, 0, atdat->numAlloc, localStream);
+ }
+
+ if (useLjCombRule(nb->nbparam->vdwType))
+ {
+ GMX_ASSERT(atdat->ljComb.elementSize() == sizeof(float2),
+ "Size of the LJ parameters element should be equal to the size of float2.");
+ copyToDeviceBuffer(&atdat->ljComb,
+ reinterpret_cast<const float2*>(nbat->params().lj_comb.data()),
+ 0,
+ numAtoms,
+ localStream,
+ GpuApiCallBehavior::Async,
+ nullptr);
+ }
+ else
+ {
+ GMX_ASSERT(atdat->atomTypes.elementSize() == sizeof(nbat->params().type[0]),
+ "Sizes of host- and device-side atom types should be the same.");
+ copyToDeviceBuffer(&atdat->atomTypes,
+ nbat->params().type.data(),
+ 0,
+ numAtoms,
+ localStream,
+ GpuApiCallBehavior::Async,
+ nullptr);
+ }
+}
+
+void gpu_free(NbnxmGpu* nb)
+{
+ if (nb == nullptr)
+ {
+ return;
+ }
+
+ sycl_atomdata_t* atdat = nb->atdat;
+ NBParamGpu* nbparam = nb->nbparam;
+
+ if ((!nbparam->coulomb_tab)
+ && (nbparam->elecType == ElecType::EwaldTab || nbparam->elecType == ElecType::EwaldTabTwin))
+ {
+ destroyParamLookupTable(&nbparam->coulomb_tab, nbparam->coulomb_tab_texobj);
+ }
+
+ if (!useLjCombRule(nb->nbparam->vdwType))
+ {
+ destroyParamLookupTable(&nbparam->nbfp, nbparam->nbfp_texobj);
+ }
+
+ if (nbparam->vdwType == VdwType::EwaldGeom || nbparam->vdwType == VdwType::EwaldLB)
+ {
+ destroyParamLookupTable(&nbparam->nbfp_comb, nbparam->nbfp_comb_texobj);
+ }
+
+ /* Free plist */
+ auto* plist = nb->plist[InteractionLocality::Local];
+ delete plist;
+ if (nb->bUseTwoStreams)
+ {
+ auto* plist_nl = nb->plist[InteractionLocality::NonLocal];
+ delete plist_nl;
+ }
+
+ /* Free nbst */
+ pfree(nb->nbst.e_lj);
+ nb->nbst.e_lj = nullptr;
+
+ pfree(nb->nbst.e_el);
+ nb->nbst.e_el = nullptr;
+
+ pfree(nb->nbst.fshift);
+ nb->nbst.fshift = nullptr;
+
+ delete atdat;
+ delete nbparam;
+ delete nb;
+}
+
+int gpu_min_ci_balanced(NbnxmGpu* nb)
+{
+ // SYCL-TODO: Logic and magic values taken from OpenCL
+ static constexpr unsigned int balancedFactor = 50;
+ if (nb == nullptr)
+ {
+ return 0;
+ }
+ const cl::sycl::device device = nb->deviceContext_->deviceInfo().syclDevice;
+ const int numComputeUnits = device.get_info<cl::sycl::info::device::max_compute_units>();
+ return balancedFactor * numComputeUnits;
+}
+
+} // namespace Nbnxm
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief
+ * NBNXM SYCL kernels
+ *
+ * \ingroup module_nbnxm
+ */
+#include "gmxpre.h"
+
+#include "nbnxm_sycl_kernel.h"
+
+#include "gromacs/gpu_utils/devicebuffer.h"
+#include "gromacs/gpu_utils/gmxsycl.h"
+#include "gromacs/math/functions.h"
+#include "gromacs/mdtypes/simulation_workload.h"
+#include "gromacs/pbcutil/ishift.h"
+#include "gromacs/utility/template_mp.h"
+
+#include "nbnxm_sycl_kernel_utils.h"
+#include "nbnxm_sycl_types.h"
+
+namespace Nbnxm
+{
+
+//! \brief Set of boolean constants mimicking preprocessor macros.
+template<enum ElecType elecType, enum VdwType vdwType>
+struct EnergyFunctionProperties {
+ static constexpr bool elecCutoff = (elecType == ElecType::Cut); ///< EL_CUTOFF
+ static constexpr bool elecRF = (elecType == ElecType::RF); ///< EL_RF
+ static constexpr bool elecEwaldAna =
+ (elecType == ElecType::EwaldAna || elecType == ElecType::EwaldAnaTwin); ///< EL_EWALD_ANA
+ static constexpr bool elecEwaldTab =
+ (elecType == ElecType::EwaldTab || elecType == ElecType::EwaldTabTwin); ///< EL_EWALD_TAB
+ static constexpr bool elecEwaldTwin =
+ (elecType == ElecType::EwaldAnaTwin || elecType == ElecType::EwaldTabTwin);
+ static constexpr bool elecEwald = (elecEwaldAna || elecEwaldTab); ///< EL_EWALD_ANY
+ static constexpr bool vdwCombLB = (vdwType == VdwType::CutCombLB);
+ static constexpr bool vdwCombGeom = (vdwType == VdwType::CutCombGeom); ///< LJ_COMB_GEOM
+ static constexpr bool vdwComb = (vdwCombLB || vdwCombGeom); ///< LJ_COMB
+ static constexpr bool vdwEwaldCombGeom = (vdwType == VdwType::EwaldGeom); ///< LJ_EWALD_COMB_GEOM
+ static constexpr bool vdwEwaldCombLB = (vdwType == VdwType::EwaldLB); ///< LJ_EWALD_COMB_LB
+ static constexpr bool vdwEwald = (vdwEwaldCombGeom || vdwEwaldCombLB); ///< LJ_EWALD
+ static constexpr bool vdwFSwitch = (vdwType == VdwType::FSwitch); ///< LJ_FORCE_SWITCH
+ static constexpr bool vdwPSwitch = (vdwType == VdwType::PSwitch); ///< LJ_POT_SWITCH
+};
+
+//! \brief Templated constants to shorten kernel function declaration.
+//@{
+template<enum VdwType vdwType>
+constexpr bool ljComb = EnergyFunctionProperties<ElecType::Count, vdwType>().vdwComb;
+
+template<enum ElecType elecType> // Yes, ElecType
+constexpr bool vdwCutoffCheck = EnergyFunctionProperties<elecType, VdwType::Count>().elecEwaldTwin;
+
+template<enum ElecType elecType>
+constexpr bool elecEwald = EnergyFunctionProperties<elecType, VdwType::Count>().elecEwald;
+
+template<enum ElecType elecType>
+constexpr bool elecEwaldTab = EnergyFunctionProperties<elecType, VdwType::Count>().elecEwaldTab;
+
+template<enum VdwType vdwType>
+constexpr bool ljEwald = EnergyFunctionProperties<ElecType::Count, vdwType>().vdwEwald;
+//@}
+
+using cl::sycl::access::fence_space;
+using cl::sycl::access::mode;
+using cl::sycl::access::target;
+
+static inline void convertSigmaEpsilonToC6C12(const float sigma,
+ const float epsilon,
+ cl::sycl::private_ptr<float> c6,
+ cl::sycl::private_ptr<float> c12)
+{
+ const float sigma2 = sigma * sigma;
+ const float sigma6 = sigma2 * sigma2 * sigma2;
+ *c6 = epsilon * sigma6;
+ *c12 = (*c6) * sigma6;
+}
+
+template<bool doCalcEnergies>
+static inline void ljForceSwitch(const shift_consts_t dispersionShift,
+ const shift_consts_t repulsionShift,
+ const float rVdwSwitch,
+ const float c6,
+ const float c12,
+ const float rInv,
+ const float r2,
+ cl::sycl::private_ptr<float> fInvR,
+ cl::sycl::private_ptr<float> eLJ)
+{
+ /* force switch constants */
+ const float dispShiftV2 = dispersionShift.c2;
+ const float dispShiftV3 = dispersionShift.c3;
+ const float repuShiftV2 = repulsionShift.c2;
+ const float repuShiftV3 = repulsionShift.c3;
+
+ const float r = r2 * rInv;
+ const float rSwitch = cl::sycl::fdim(r, rVdwSwitch); // max(r - rVdwSwitch, 0)
+
+ *fInvR += -c6 * (dispShiftV2 + dispShiftV3 * rSwitch) * rSwitch * rSwitch * rInv
+ + c12 * (repuShiftV2 + repuShiftV3 * rSwitch) * rSwitch * rSwitch * rInv;
+
+ if constexpr (doCalcEnergies)
+ {
+ const float dispShiftF2 = dispShiftV2 / 3;
+ const float dispShiftF3 = dispShiftV3 / 4;
+ const float repuShiftF2 = repuShiftV2 / 3;
+ const float repuShiftF3 = repuShiftV3 / 4;
+ *eLJ += c6 * (dispShiftF2 + dispShiftF3 * rSwitch) * rSwitch * rSwitch * rSwitch
+ - c12 * (repuShiftF2 + repuShiftF3 * rSwitch) * rSwitch * rSwitch * rSwitch;
+ }
+}
+
+//! \brief Fetch C6 grid contribution coefficients and return the product of these.
+template<enum VdwType vdwType>
+static inline float calculateLJEwaldC6Grid(const DeviceAccessor<float, mode::read> a_nbfpComb,
+ const int typeI,
+ const int typeJ)
+{
+ if constexpr (vdwType == VdwType::EwaldGeom)
+ {
+ return a_nbfpComb[2 * typeI] * a_nbfpComb[2 * typeJ];
+ }
+ else
+ {
+ static_assert(vdwType == VdwType::EwaldLB);
+ /* sigma and epsilon are scaled to give 6*C6 */
+ const float c6_i = a_nbfpComb[2 * typeI];
+ const float c12_i = a_nbfpComb[2 * typeI + 1];
+ const float c6_j = a_nbfpComb[2 * typeJ];
+ const float c12_j = a_nbfpComb[2 * typeJ + 1];
+
+ const float sigma = c6_i + c6_j;
+ const float epsilon = c12_i * c12_j;
+
+ const float sigma2 = sigma * sigma;
+ return epsilon * sigma2 * sigma2 * sigma2;
+ }
+}
+
+//! Calculate LJ-PME grid force contribution with geometric or LB combination rule.
+template<bool doCalcEnergies, enum VdwType vdwType>
+static inline void ljEwaldComb(const DeviceAccessor<float, mode::read> a_nbfpComb,
+ const float sh_lj_ewald,
+ const int typeI,
+ const int typeJ,
+ const float r2,
+ const float r2Inv,
+ const float lje_coeff2,
+ const float lje_coeff6_6,
+ const float int_bit,
+ cl::sycl::private_ptr<float> fInvR,
+ cl::sycl::private_ptr<float> eLJ)
+{
+ const float c6grid = calculateLJEwaldC6Grid<vdwType>(a_nbfpComb, typeI, typeJ);
+
+ /* Recalculate inv_r6 without exclusion mask */
+ const float inv_r6_nm = r2Inv * r2Inv * r2Inv;
+ const float cr2 = lje_coeff2 * r2;
+ const float expmcr2 = cl::sycl::exp(-cr2);
+ const float poly = 1.0F + cr2 + 0.5F * cr2 * cr2;
+
+ /* Subtract the grid force from the total LJ force */
+ *fInvR += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * r2Inv;
+
+ if constexpr (doCalcEnergies)
+ {
+ /* Shift should be applied only to real LJ pairs */
+ const float sh_mask = sh_lj_ewald * int_bit;
+ *eLJ += c_oneSixth * c6grid * (inv_r6_nm * (1.0F - expmcr2 * poly) + sh_mask);
+ }
+}
+
+/*! \brief Apply potential switch. */
+template<bool doCalcEnergies>
+static inline void ljPotentialSwitch(const switch_consts_t vdwSwitch,
+ const float rVdwSwitch,
+ const float rInv,
+ const float r2,
+ cl::sycl::private_ptr<float> fInvR,
+ cl::sycl::private_ptr<float> eLJ)
+{
+ /* potential switch constants */
+ const float switchV3 = vdwSwitch.c3;
+ const float switchV4 = vdwSwitch.c4;
+ const float switchV5 = vdwSwitch.c5;
+ const float switchF2 = 3 * vdwSwitch.c3;
+ const float switchF3 = 4 * vdwSwitch.c4;
+ const float switchF4 = 5 * vdwSwitch.c5;
+
+ const float r = r2 * rInv;
+ const float rSwitch = r - rVdwSwitch;
+
+ if (rSwitch > 0.0F)
+ {
+ const float sw =
+ 1.0F + (switchV3 + (switchV4 + switchV5 * rSwitch) * rSwitch) * rSwitch * rSwitch * rSwitch;
+ const float dsw = (switchF2 + (switchF3 + switchF4 * rSwitch) * rSwitch) * rSwitch * rSwitch;
+
+ *fInvR = (*fInvR) * sw - rInv * (*eLJ) * dsw;
+ if constexpr (doCalcEnergies)
+ {
+ *eLJ *= sw;
+ }
+ }
+}
+
+
+/*! \brief Calculate analytical Ewald correction term. */
+static inline float pmeCorrF(const float z2)
+{
+ constexpr float FN6 = -1.7357322914161492954e-8F;
+ constexpr float FN5 = 1.4703624142580877519e-6F;
+ constexpr float FN4 = -0.000053401640219807709149F;
+ constexpr float FN3 = 0.0010054721316683106153F;
+ constexpr float FN2 = -0.019278317264888380590F;
+ constexpr float FN1 = 0.069670166153766424023F;
+ constexpr float FN0 = -0.75225204789749321333F;
+
+ constexpr float FD4 = 0.0011193462567257629232F;
+ constexpr float FD3 = 0.014866955030185295499F;
+ constexpr float FD2 = 0.11583842382862377919F;
+ constexpr float FD1 = 0.50736591960530292870F;
+ constexpr float FD0 = 1.0F;
+
+ const float z4 = z2 * z2;
+
+ float polyFD0 = FD4 * z4 + FD2;
+ const float polyFD1 = FD3 * z4 + FD1;
+ polyFD0 = polyFD0 * z4 + FD0;
+ polyFD0 = polyFD1 * z2 + polyFD0;
+
+ polyFD0 = 1.0F / polyFD0;
+
+ float polyFN0 = FN6 * z4 + FN4;
+ float polyFN1 = FN5 * z4 + FN3;
+ polyFN0 = polyFN0 * z4 + FN2;
+ polyFN1 = polyFN1 * z4 + FN1;
+ polyFN0 = polyFN0 * z4 + FN0;
+ polyFN0 = polyFN1 * z2 + polyFN0;
+
+ return polyFN0 * polyFD0;
+}
+
+/*! \brief Linear interpolation using exactly two FMA operations.
+ *
+ * Implements numeric equivalent of: (1-t)*d0 + t*d1.
+ */
+template<typename T>
+static inline T lerp(T d0, T d1, T t)
+{
+ return fma(t, d1, fma(-t, d0, d0));
+}
+
+/*! \brief Interpolate Ewald coulomb force correction using the F*r table. */
+static inline float interpolateCoulombForceR(const DeviceAccessor<float, mode::read> a_coulombTab,
+ const float coulombTabScale,
+ const float r)
+{
+ const float normalized = coulombTabScale * r;
+ const int index = static_cast<int>(normalized);
+ const float fraction = normalized - index;
+
+ const float left = a_coulombTab[index];
+ const float right = a_coulombTab[index + 1];
+
+ return lerp(left, right, fraction); // TODO: cl::sycl::mix
+}
+
+static inline void reduceForceJShuffle(float3 f,
+ const cl::sycl::nd_item<1> itemIdx,
+ const int tidxi,
+ const int aidx,
+ DeviceAccessor<float, mode::read_write> a_f)
+{
+ static_assert(c_clSize == 8 || c_clSize == 4);
+ sycl_2020::sub_group sg = itemIdx.get_sub_group();
+
+ f[0] += shuffleDown(f[0], 1, sg);
+ f[1] += shuffleUp(f[1], 1, sg);
+ f[2] += shuffleDown(f[2], 1, sg);
+ if (tidxi & 1)
+ {
+ f[0] = f[1];
+ }
+
+ f[0] += shuffleDown(f[0], 2, sg);
+ f[2] += shuffleUp(f[2], 2, sg);
+ if (tidxi & 2)
+ {
+ f[0] = f[2];
+ }
+
+ if constexpr (c_clSize == 8)
+ {
+ f[0] += shuffleDown(f[0], 4, sg);
+ }
+
+ if (tidxi < 3)
+ {
+ atomicFetchAdd(a_f, 3 * aidx + tidxi, f[0]);
+ }
+}
+
+
+/*! \brief Final i-force reduction.
+ *
+ * This implementation works only with power of two array sizes.
+ */
+static inline void reduceForceIAndFShift(cl::sycl::accessor<float, 1, mode::read_write, target::local> sm_buf,
+ const float3 fCiBuf[c_nbnxnGpuNumClusterPerSupercluster],
+ const bool calcFShift,
+ const cl::sycl::nd_item<1> itemIdx,
+ const int tidxi,
+ const int tidxj,
+ const int sci,
+ const int shift,
+ DeviceAccessor<float, mode::read_write> a_f,
+ DeviceAccessor<float, mode::read_write> a_fShift)
+{
+ static constexpr int bufStride = c_clSize * c_clSize;
+ static constexpr int clSizeLog2 = gmx::StaticLog2<c_clSize>::value;
+ const int tidx = tidxi + tidxj * c_clSize;
+ float fShiftBuf = 0;
+ for (int ciOffset = 0; ciOffset < c_nbnxnGpuNumClusterPerSupercluster; ciOffset++)
+ {
+ const int aidx = (sci * c_nbnxnGpuNumClusterPerSupercluster + ciOffset) * c_clSize + tidxi;
+ /* store i forces in shmem */
+ sm_buf[tidx] = fCiBuf[ciOffset][0];
+ sm_buf[bufStride + tidx] = fCiBuf[ciOffset][1];
+ sm_buf[2 * bufStride + tidx] = fCiBuf[ciOffset][2];
+ itemIdx.barrier(fence_space::local_space);
+
+ /* Reduce the initial c_clSize values for each i atom to half
+ * every step by using c_clSize * i threads. */
+ int i = c_clSize / 2;
+ for (int j = clSizeLog2 - 1; j > 0; j--)
+ {
+ if (tidxj < i)
+ {
+ sm_buf[tidxj * c_clSize + tidxi] += sm_buf[(tidxj + i) * c_clSize + tidxi];
+ sm_buf[bufStride + tidxj * c_clSize + tidxi] +=
+ sm_buf[bufStride + (tidxj + i) * c_clSize + tidxi];
+ sm_buf[2 * bufStride + tidxj * c_clSize + tidxi] +=
+ sm_buf[2 * bufStride + (tidxj + i) * c_clSize + tidxi];
+ }
+ i >>= 1;
+ itemIdx.barrier(fence_space::local_space);
+ }
+
+ /* i == 1, last reduction step, writing to global mem */
+ /* Split the reduction between the first 3 line threads
+ Threads with line id 0 will do the reduction for (float3).x components
+ Threads with line id 1 will do the reduction for (float3).y components
+ Threads with line id 2 will do the reduction for (float3).z components. */
+ if (tidxj < 3)
+ {
+ const float f =
+ sm_buf[tidxj * bufStride + tidxi] + sm_buf[tidxj * bufStride + c_clSize + tidxi];
+ atomicFetchAdd(a_f, 3 * aidx + tidxj, f);
+ if (calcFShift)
+ {
+ fShiftBuf += f;
+ }
+ }
+ itemIdx.barrier(fence_space::local_space);
+ }
+ /* add up local shift forces into global mem */
+ if (calcFShift)
+ {
+ /* Only threads with tidxj < 3 will update fshift.
+ The threads performing the update must be the same as the threads
+ storing the reduction result above. */
+ if (tidxj < 3)
+ {
+ atomicFetchAdd(a_fShift, 3 * shift + tidxj, fShiftBuf);
+ }
+ }
+}
+
+
+/*! \brief Main kernel for NBNXM.
+ *
+ */
+template<bool doPruneNBL, bool doCalcEnergies, enum ElecType elecType, enum VdwType vdwType>
+auto nbnxmKernel(cl::sycl::handler& cgh,
+ DeviceAccessor<float4, mode::read> a_xq,
+ DeviceAccessor<float, mode::read_write> a_f,
+ DeviceAccessor<float3, mode::read> a_shiftVec,
+ DeviceAccessor<float, mode::read_write> a_fShift,
+ OptionalAccessor<float, mode::read_write, doCalcEnergies> a_energyElec,
+ OptionalAccessor<float, mode::read_write, doCalcEnergies> a_energyVdw,
+ DeviceAccessor<nbnxn_cj4_t, doPruneNBL ? mode::read_write : mode::read> a_plistCJ4,
+ DeviceAccessor<nbnxn_sci_t, mode::read> a_plistSci,
+ DeviceAccessor<nbnxn_excl_t, mode::read> a_plistExcl,
+ OptionalAccessor<float2, mode::read, ljComb<vdwType>> a_ljComb,
+ OptionalAccessor<int, mode::read, !ljComb<vdwType>> a_atomTypes,
+ OptionalAccessor<float, mode::read, !ljComb<vdwType>> a_nbfp,
+ OptionalAccessor<float, mode::read, ljEwald<vdwType>> a_nbfpComb,
+ OptionalAccessor<float, mode::read, elecEwaldTab<elecType>> a_coulombTab,
+ const int numTypes,
+ const float rCoulombSq,
+ const float rVdwSq,
+ const float twoKRf,
+ const float ewaldBeta,
+ const float rlistOuterSq,
+ const float ewaldShift,
+ const float epsFac,
+ const float ewaldCoeffLJ,
+ const float cRF,
+ const shift_consts_t dispersionShift,
+ const shift_consts_t repulsionShift,
+ const switch_consts_t vdwSwitch,
+ const float rVdwSwitch,
+ const float ljEwaldShift,
+ const float coulombTabScale,
+ const bool calcShift)
+{
+ static constexpr EnergyFunctionProperties<elecType, vdwType> props;
+
+ cgh.require(a_xq);
+ cgh.require(a_f);
+ cgh.require(a_shiftVec);
+ cgh.require(a_fShift);
+ if constexpr (doCalcEnergies)
+ {
+ cgh.require(a_energyElec);
+ cgh.require(a_energyVdw);
+ }
+ cgh.require(a_plistCJ4);
+ cgh.require(a_plistSci);
+ cgh.require(a_plistExcl);
+ if constexpr (!props.vdwComb)
+ {
+ cgh.require(a_atomTypes);
+ cgh.require(a_nbfp);
+ }
+ else
+ {
+ cgh.require(a_ljComb);
+ }
+ if constexpr (props.vdwEwald)
+ {
+ cgh.require(a_nbfpComb);
+ }
+ if constexpr (props.elecEwaldTab)
+ {
+ cgh.require(a_coulombTab);
+ }
+
+ // shmem buffer for i x+q pre-loading
+ cl::sycl::accessor<float4, 2, mode::read_write, target::local> sm_xq(
+ cl::sycl::range<2>(c_nbnxnGpuNumClusterPerSupercluster, c_clSize), cgh);
+
+ // shmem buffer for force reduction
+ // SYCL-TODO: Make into 3D; section 4.7.6.11 of SYCL2020 specs
+ cl::sycl::accessor<float, 1, mode::read_write, target::local> sm_reductionBuffer(
+ cl::sycl::range<1>(c_clSize * c_clSize * DIM), cgh);
+
+ auto sm_atomTypeI = [&]() {
+ if constexpr (!props.vdwComb)
+ {
+ return cl::sycl::accessor<int, 2, mode::read_write, target::local>(
+ cl::sycl::range<2>(c_nbnxnGpuNumClusterPerSupercluster, c_clSize), cgh);
+ }
+ else
+ {
+ return nullptr;
+ }
+ }();
+
+ auto sm_ljCombI = [&]() {
+ if constexpr (props.vdwComb)
+ {
+ return cl::sycl::accessor<float2, 2, mode::read_write, target::local>(
+ cl::sycl::range<2>(c_nbnxnGpuNumClusterPerSupercluster, c_clSize), cgh);
+ }
+ else
+ {
+ return nullptr;
+ }
+ }();
+
+ /* Flag to control the calculation of exclusion forces in the kernel
+ * We do that with Ewald (elec/vdw) and RF. Cut-off only has exclusion
+ * energy terms. */
+ constexpr bool doExclusionForces =
+ (props.elecEwald || props.elecRF || props.vdwEwald || (props.elecCutoff && doCalcEnergies));
+
+ constexpr int subGroupSize = c_clSize * c_clSize / 2;
+
+ return [=](cl::sycl::nd_item<1> itemIdx) [[intel::reqd_sub_group_size(subGroupSize)]]
+ {
+ /* thread/block/warp id-s */
+ const cl::sycl::id<3> localId = unflattenId<c_clSize, c_clSize>(itemIdx.get_local_id());
+ const unsigned tidxi = localId[0];
+ const unsigned tidxj = localId[1];
+ const unsigned tidx = tidxj * c_clSize + tidxi;
+ const unsigned tidxz = 0;
+
+ // Group indexing was flat originally, no need to unflatten it.
+ const unsigned bidx = itemIdx.get_group(0);
+
+ const sycl_2020::sub_group sg = itemIdx.get_sub_group();
+ // Better use sg.get_group_range, but too much of the logic relies on it anyway
+ const unsigned widx = tidx / subGroupSize;
+
+ float3 fCiBuf[c_nbnxnGpuNumClusterPerSupercluster]; // i force buffer
+ for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
+ {
+ fCiBuf[i] = float3(0.0F, 0.0F, 0.0F);
+ }
+
+ const nbnxn_sci_t nbSci = a_plistSci[bidx];
+ const int sci = nbSci.sci;
+ const int cij4Start = nbSci.cj4_ind_start;
+ const int cij4End = nbSci.cj4_ind_end;
+
+ // Only needed if props.elecEwaldAna
+ const float beta2 = ewaldBeta * ewaldBeta;
+ const float beta3 = ewaldBeta * ewaldBeta * ewaldBeta;
+
+ for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i += c_clSize)
+ {
+ /* Pre-load i-atom x and q into shared memory */
+ const int ci = sci * c_nbnxnGpuNumClusterPerSupercluster + tidxj + i;
+ const int ai = ci * c_clSize + tidxi;
+ const cl::sycl::id<2> cacheIdx = cl::sycl::id<2>(tidxj + i, tidxi);
+
+ const float3 shift = a_shiftVec[nbSci.shift];
+ float4 xqi = a_xq[ai];
+ xqi += float4(shift[0], shift[1], shift[2], 0.0F);
+ xqi[3] *= epsFac;
+ sm_xq[cacheIdx] = xqi;
+
+ if constexpr (!props.vdwComb)
+ {
+ // Pre-load the i-atom types into shared memory
+ sm_atomTypeI[cacheIdx] = a_atomTypes[ai];
+ }
+ else
+ {
+ // Pre-load the LJ combination parameters into shared memory
+ sm_ljCombI[cacheIdx] = a_ljComb[ai];
+ }
+ }
+ itemIdx.barrier(fence_space::local_space);
+
+ float ewaldCoeffLJ_2, ewaldCoeffLJ_6_6; // Only needed if (props.vdwEwald)
+ if constexpr (props.vdwEwald)
+ {
+ ewaldCoeffLJ_2 = ewaldCoeffLJ * ewaldCoeffLJ;
+ ewaldCoeffLJ_6_6 = ewaldCoeffLJ_2 * ewaldCoeffLJ_2 * ewaldCoeffLJ_2 * c_oneSixth;
+ }
+
+ float energyVdw, energyElec; // Only needed if (doCalcEnergies)
+ if constexpr (doCalcEnergies)
+ {
+ energyVdw = energyElec = 0.0F;
+ }
+ if constexpr (doCalcEnergies && doExclusionForces)
+ {
+ if (nbSci.shift == CENTRAL && a_plistCJ4[cij4Start].cj[0] == sci * c_nbnxnGpuNumClusterPerSupercluster)
+ {
+ // we have the diagonal: add the charge and LJ self interaction energy term
+ for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
+ {
+ // TODO: Are there other options?
+ if constexpr (props.elecEwald || props.elecRF || props.elecCutoff)
+ {
+ const float qi = sm_xq[i][tidxi][3];
+ energyElec += qi * qi;
+ }
+ if constexpr (props.vdwEwald)
+ {
+ energyVdw +=
+ a_nbfp[a_atomTypes[(sci * c_nbnxnGpuNumClusterPerSupercluster + i) * c_clSize + tidxi]
+ * (numTypes + 1) * 2];
+ }
+ }
+ /* divide the self term(s) equally over the j-threads, then multiply with the coefficients. */
+ if constexpr (props.vdwEwald)
+ {
+ energyVdw /= c_clSize;
+ energyVdw *= 0.5F * c_oneSixth * ewaldCoeffLJ_6_6; // c_OneTwelfth?
+ }
+ if constexpr (props.elecRF || props.elecCutoff)
+ {
+ // Correct for epsfac^2 due to adding qi^2 */
+ energyElec /= epsFac * c_clSize;
+ energyElec *= -0.5F * cRF;
+ }
+ if constexpr (props.elecEwald)
+ {
+ // Correct for epsfac^2 due to adding qi^2 */
+ energyElec /= epsFac * c_clSize;
+ energyElec *= -ewaldBeta * c_OneOverSqrtPi; /* last factor 1/sqrt(pi) */
+ }
+ } // (nbSci.shift == CENTRAL && a_plistCJ4[cij4Start].cj[0] == sci * c_nbnxnGpuNumClusterPerSupercluster)
+ } // (doCalcEnergies && doExclusionForces)
+
+ // Only needed if (doExclusionForces)
+ const bool nonSelfInteraction = !(nbSci.shift == CENTRAL & tidxj <= tidxi);
+
+ // loop over the j clusters = seen by any of the atoms in the current super-cluster
+ for (int j4 = cij4Start + tidxz; j4 < cij4End; j4 += 1)
+ {
+ unsigned imask = a_plistCJ4[j4].imei[widx].imask;
+ if (!doPruneNBL && !imask)
+ {
+ continue;
+ }
+ const int wexclIdx = a_plistCJ4[j4].imei[widx].excl_ind;
+ const unsigned wexcl = a_plistExcl[wexclIdx].pair[tidx & (subGroupSize - 1)]; // sg.get_local_linear_id()
+ for (int jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
+ {
+ const bool maskSet =
+ imask & (superClInteractionMask << (jm * c_nbnxnGpuNumClusterPerSupercluster));
+ if (!maskSet)
+ {
+ continue;
+ }
+ unsigned maskJI = (1U << (jm * c_nbnxnGpuNumClusterPerSupercluster));
+ const int cj = a_plistCJ4[j4].cj[jm];
+ const int aj = cj * c_clSize + tidxj;
+
+ // load j atom data
+ const float4 xqj = a_xq[aj];
+
+ const float3 xj(xqj[0], xqj[1], xqj[2]);
+ const float qj = xqj[3];
+ int atomTypeJ; // Only needed if (!props.vdwComb)
+ float2 ljCombJ; // Only needed if (props.vdwComb)
+ if constexpr (props.vdwComb)
+ {
+ ljCombJ = a_ljComb[aj];
+ }
+ else
+ {
+ atomTypeJ = a_atomTypes[aj];
+ }
+
+ float3 fCjBuf(0.0F, 0.0F, 0.0F);
+
+ for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
+ {
+ if (imask & maskJI)
+ {
+ // i cluster index
+ const int ci = sci * c_nbnxnGpuNumClusterPerSupercluster + i;
+ // all threads load an atom from i cluster ci into shmem!
+ const float4 xqi = sm_xq[i][tidxi];
+ const float3 xi(xqi[0], xqi[1], xqi[2]);
+
+ // distance between i and j atoms
+ const float3 rv = xi - xj;
+ float r2 = norm2(rv);
+
+ if constexpr (doPruneNBL)
+ {
+ /* If _none_ of the atoms pairs are in cutoff range,
+ * the bit corresponding to the current
+ * cluster-pair in imask gets set to 0. */
+ if (!sycl_2020::group_any_of(sg, r2 < rlistOuterSq))
+ {
+ imask &= ~maskJI;
+ }
+ }
+ const float pairExclMask = (wexcl & maskJI) ? 1.0F : 0.0F;
+
+ // cutoff & exclusion check
+
+ const bool notExcluded = doExclusionForces ? (nonSelfInteraction | (ci != cj))
+ : (wexcl & maskJI);
+
+ // SYCL-TODO: Check optimal way of branching here.
+ if ((r2 < rCoulombSq) && notExcluded)
+ {
+ const float qi = xqi[3];
+ int atomTypeI; // Only needed if (!props.vdwComb)
+ float c6, c12, sigma, epsilon;
+
+ if constexpr (!props.vdwComb)
+ {
+ /* LJ 6*C6 and 12*C12 */
+ atomTypeI = sm_atomTypeI[i][tidxi];
+ const int idx = (numTypes * atomTypeI + atomTypeJ) * 2;
+ c6 = a_nbfp[idx]; // TODO: Make a_nbfm into float2
+ c12 = a_nbfp[idx + 1];
+ }
+ else
+ {
+ const float2 ljCombI = sm_ljCombI[i][tidxi];
+ if constexpr (props.vdwCombGeom)
+ {
+ c6 = ljCombI[0] * ljCombJ[0];
+ c12 = ljCombI[1] * ljCombJ[1];
+ }
+ else
+ {
+ static_assert(props.vdwCombLB);
+ // LJ 2^(1/6)*sigma and 12*epsilon
+ sigma = ljCombI[0] + ljCombJ[0];
+ epsilon = ljCombI[1] * ljCombJ[1];
+ if constexpr (doCalcEnergies)
+ {
+ convertSigmaEpsilonToC6C12(sigma, epsilon, &c6, &c12);
+ }
+ } // props.vdwCombGeom
+ } // !props.vdwComb
+
+ // Ensure distance do not become so small that r^-12 overflows
+ r2 = std::max(r2, c_nbnxnMinDistanceSquared);
+ // SYCL-TODO: sycl::half_precision::rsqrt?
+ const float rInv = cl::sycl::native::rsqrt(r2);
+ const float r2Inv = rInv * rInv;
+ float r6Inv, fInvR, energyLJPair;
+ if constexpr (!props.vdwCombLB || doCalcEnergies)
+ {
+ r6Inv = r2Inv * r2Inv * r2Inv;
+ if constexpr (doExclusionForces)
+ {
+ // SYCL-TODO: Check if true for SYCL
+ /* We could mask r2Inv, but with Ewald masking both
+ * r6Inv and fInvR is faster */
+ r6Inv *= pairExclMask;
+ }
+ fInvR = r6Inv * (c12 * r6Inv - c6) * r2Inv;
+ }
+ else
+ {
+ float sig_r = sigma * rInv;
+ float sig_r2 = sig_r * sig_r;
+ float sig_r6 = sig_r2 * sig_r2 * sig_r2;
+ if constexpr (doExclusionForces)
+ {
+ sig_r6 *= pairExclMask;
+ }
+ fInvR = epsilon * sig_r6 * (sig_r6 - 1.0F) * r2Inv;
+ } // (!props.vdwCombLB || doCalcEnergies)
+ if constexpr (doCalcEnergies || props.vdwPSwitch)
+ {
+ energyLJPair = pairExclMask
+ * (c12 * (r6Inv * r6Inv + repulsionShift.cpot) * c_oneTwelfth
+ - c6 * (r6Inv + dispersionShift.cpot) * c_oneSixth);
+ }
+ if constexpr (props.vdwFSwitch)
+ {
+ ljForceSwitch<doCalcEnergies>(
+ dispersionShift, repulsionShift, rVdwSwitch, c6, c12, rInv, r2, &fInvR, &energyLJPair);
+ }
+ if constexpr (props.vdwEwald)
+ {
+ ljEwaldComb<doCalcEnergies, vdwType>(a_nbfpComb,
+ ljEwaldShift,
+ atomTypeI,
+ atomTypeJ,
+ r2,
+ r2Inv,
+ ewaldCoeffLJ_2,
+ ewaldCoeffLJ_6_6,
+ pairExclMask,
+ &fInvR,
+ &energyLJPair);
+ } // (props.vdwEwald)
+ if constexpr (props.vdwPSwitch)
+ {
+ ljPotentialSwitch<doCalcEnergies>(
+ vdwSwitch, rVdwSwitch, rInv, r2, &fInvR, &energyLJPair);
+ }
+ if constexpr (props.elecEwaldTwin)
+ {
+ // Separate VDW cut-off check to enable twin-range cut-offs
+ // (rVdw < rCoulomb <= rList)
+ const float vdwInRange = (r2 < rVdwSq) ? 1.0F : 0.0F;
+ fInvR *= vdwInRange;
+ if constexpr (doCalcEnergies)
+ {
+ energyLJPair *= vdwInRange;
+ }
+ }
+ if constexpr (doCalcEnergies)
+ {
+ energyVdw += energyLJPair;
+ }
+
+ if constexpr (props.elecCutoff)
+ {
+ if constexpr (doExclusionForces)
+ {
+ fInvR += qi * qj * pairExclMask * r2Inv * rInv;
+ }
+ else
+ {
+ fInvR += qi * qj * r2Inv * rInv;
+ }
+ }
+ if constexpr (props.elecRF)
+ {
+ fInvR += qi * qj * (pairExclMask * r2Inv * rInv - twoKRf);
+ }
+ if constexpr (props.elecEwaldAna)
+ {
+ fInvR += qi * qj
+ * (pairExclMask * r2Inv * rInv + pmeCorrF(beta2 * r2) * beta3);
+ }
+ if constexpr (props.elecEwaldTab)
+ {
+ fInvR += qi * qj
+ * (pairExclMask * r2Inv
+ - interpolateCoulombForceR(
+ a_coulombTab, coulombTabScale, r2 * rInv))
+ * rInv;
+ }
+
+ if constexpr (doCalcEnergies)
+ {
+ if constexpr (props.elecCutoff)
+ {
+ energyElec += qi * qj * (pairExclMask * rInv - cRF);
+ }
+ if constexpr (props.elecRF)
+ {
+ energyElec +=
+ qi * qj * (pairExclMask * rInv + 0.5f * twoKRf * r2 - cRF);
+ }
+ if constexpr (props.elecEwald)
+ {
+ energyElec +=
+ qi * qj
+ * (rInv * (pairExclMask - cl::sycl::erf(r2 * rInv * ewaldBeta))
+ - pairExclMask * ewaldShift);
+ }
+ }
+
+ const float3 forceIJ = rv * fInvR;
+
+ /* accumulate j forces in registers */
+ fCjBuf -= forceIJ;
+ /* accumulate i forces in registers */
+ fCiBuf[i] += forceIJ;
+ } // (r2 < rCoulombSq) && notExcluded
+ } // (imask & maskJI)
+ /* shift the mask bit by 1 */
+ maskJI += maskJI;
+ } // for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
+ /* reduce j forces */
+ reduceForceJShuffle(fCjBuf, itemIdx, tidxi, aj, a_f);
+ } // for (int jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
+ if constexpr (doPruneNBL)
+ {
+ /* Update the imask with the new one which does not contain the
+ * out of range clusters anymore. */
+ a_plistCJ4[j4].imei[widx].imask = imask;
+ }
+ } // for (int j4 = cij4Start; j4 < cij4End; j4 += 1)
+
+ /* skip central shifts when summing shift forces */
+ const bool doCalcShift = (calcShift && !(nbSci.shift == CENTRAL));
+
+ reduceForceIAndFShift(
+ sm_reductionBuffer, fCiBuf, doCalcShift, itemIdx, tidxi, tidxj, sci, nbSci.shift, a_f, a_fShift);
+
+ if constexpr (doCalcEnergies)
+ {
+ const float energyVdwGroup = sycl_2020::group_reduce(
+ itemIdx.get_group(), energyVdw, 0.0F, sycl_2020::plus<float>());
+ const float energyElecGroup = sycl_2020::group_reduce(
+ itemIdx.get_group(), energyElec, 0.0F, sycl_2020::plus<float>());
+
+ if (tidx == 0)
+ {
+ atomicFetchAdd(a_energyVdw, 0, energyVdwGroup);
+ atomicFetchAdd(a_energyElec, 0, energyElecGroup);
+ }
+ }
+ };
+}
+
+// SYCL 1.2.1 requires providing a unique type for a kernel. Should not be needed for SYCL2020.
+template<bool doPruneNBL, bool doCalcEnergies, enum ElecType elecType, enum VdwType vdwType>
+class NbnxmKernelName;
+
+template<bool doPruneNBL, bool doCalcEnergies, enum ElecType elecType, enum VdwType vdwType, class... Args>
+cl::sycl::event launchNbnxmKernel(const DeviceStream& deviceStream, const int numSci, Args&&... args)
+{
+ // Should not be needed for SYCL2020.
+ using kernelNameType = NbnxmKernelName<doPruneNBL, doCalcEnergies, elecType, vdwType>;
+
+ /* Kernel launch config:
+ * - The thread block dimensions match the size of i-clusters, j-clusters,
+ * and j-cluster concurrency, in x, y, and z, respectively.
+ * - The 1D block-grid contains as many blocks as super-clusters.
+ */
+ const int numBlocks = numSci;
+ const cl::sycl::range<3> blockSize{ c_clSize, c_clSize, 1 };
+ const cl::sycl::range<3> globalSize{ numBlocks * blockSize[0], blockSize[1], blockSize[2] };
+ const cl::sycl::nd_range<3> range{ globalSize, blockSize };
+
+ cl::sycl::queue q = deviceStream.stream();
+
+ cl::sycl::event e = q.submit([&](cl::sycl::handler& cgh) {
+ auto kernel = nbnxmKernel<doPruneNBL, doCalcEnergies, elecType, vdwType>(
+ cgh, std::forward<Args>(args)...);
+ cgh.parallel_for<kernelNameType>(flattenNDRange(range), kernel);
+ });
+
+ return e;
+}
+
+template<class... Args>
+cl::sycl::event chooseAndLaunchNbnxmKernel(bool doPruneNBL,
+ bool doCalcEnergies,
+ enum ElecType elecType,
+ enum VdwType vdwType,
+ Args&&... args)
+{
+ return gmx::dispatchTemplatedFunction(
+ [&](auto doPruneNBL_, auto doCalcEnergies_, auto elecType_, auto vdwType_) {
+ return launchNbnxmKernel<doPruneNBL_, doCalcEnergies_, elecType_, vdwType_>(
+ std::forward<Args>(args)...);
+ },
+ doPruneNBL,
+ doCalcEnergies,
+ elecType,
+ vdwType);
+}
+
+void launchNbnxmKernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const InteractionLocality iloc)
+{
+ sycl_atomdata_t* adat = nb->atdat;
+ NBParamGpu* nbp = nb->nbparam;
+ gpu_plist* plist = nb->plist[iloc];
+ const bool doPruneNBL = (plist->haveFreshList && !nb->didPrune[iloc]);
+ const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
+
+ // Casting to float simplifies using atomic ops in the kernel
+ cl::sycl::buffer<float3, 1> f(*adat->f.buffer_);
+ auto fAsFloat = f.reinterpret<float, 1>(f.get_count() * DIM);
+ cl::sycl::buffer<float3, 1> fShift(*adat->fShift.buffer_);
+ auto fShiftAsFloat = fShift.reinterpret<float, 1>(fShift.get_count() * DIM);
+
+ cl::sycl::event e = chooseAndLaunchNbnxmKernel(doPruneNBL,
+ stepWork.computeEnergy,
+ nbp->elecType,
+ nbp->vdwType,
+ deviceStream,
+ plist->nsci,
+ adat->xq,
+ fAsFloat,
+ adat->shiftVec,
+ fShiftAsFloat,
+ adat->eElec,
+ adat->eLJ,
+ plist->cj4,
+ plist->sci,
+ plist->excl,
+ adat->ljComb,
+ adat->atomTypes,
+ nbp->nbfp,
+ nbp->nbfp_comb,
+ nbp->coulomb_tab,
+ adat->numTypes,
+ nbp->rcoulomb_sq,
+ nbp->rvdw_sq,
+ nbp->two_k_rf,
+ nbp->ewald_beta,
+ nbp->rlistOuter_sq,
+ nbp->sh_ewald,
+ nbp->epsfac,
+ nbp->ewaldcoeff_lj,
+ nbp->c_rf,
+ nbp->dispersion_shift,
+ nbp->repulsion_shift,
+ nbp->vdw_switch,
+ nbp->rvdw_switch,
+ nbp->sh_lj_ewald,
+ nbp->coulomb_tab_scale,
+ stepWork.computeVirial);
+}
+
+} // namespace Nbnxm
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ * \brief
+ * Declares nbnxn sycl helper functions
+ *
+ * \ingroup module_nbnxm
+ */
+#ifndef GMX_NBNXM_SYCL_NBNXM_SYCL_KERNEL_H
+#define GMX_NBNXM_SYCL_NBNXM_SYCL_KERNEL_H
+
+// Forward declarations
+namespace gmx
+{
+enum class InteractionLocality;
+class StepWorkload;
+} // namespace gmx
+struct NbnxmGpu;
+
+namespace Nbnxm
+{
+using gmx::InteractionLocality;
+
+void launchNbnxmKernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const InteractionLocality iloc);
+
+} // namespace Nbnxm
+
+#endif // GMX_NBNXM_SYCL_NBNXM_SYCL_KERNEL_H
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief
+ * NBNXM SYCL kernels
+ *
+ * \ingroup module_nbnxm
+ */
+#include "gmxpre.h"
+
+#include "nbnxm_sycl_kernel_pruneonly.h"
+
+#include "gromacs/gpu_utils/devicebuffer.h"
+#include "gromacs/gpu_utils/gmxsycl.h"
+#include "gromacs/utility/template_mp.h"
+
+#include "nbnxm_sycl_kernel_utils.h"
+#include "nbnxm_sycl_types.h"
+
+using cl::sycl::access::fence_space;
+using cl::sycl::access::mode;
+using cl::sycl::access::target;
+
+namespace Nbnxm
+{
+
+/*! \brief Prune-only kernel for NBNXM.
+ *
+ */
+template<bool haveFreshList>
+auto nbnxmKernelPruneOnly(cl::sycl::handler& cgh,
+ DeviceAccessor<float4, mode::read> a_xq,
+ DeviceAccessor<float3, mode::read> a_shiftVec,
+ DeviceAccessor<nbnxn_cj4_t, mode::read_write> a_plistCJ4,
+ DeviceAccessor<nbnxn_sci_t, mode::read> a_plistSci,
+ DeviceAccessor<unsigned int, haveFreshList ? mode::write : mode::read> a_plistIMask,
+ const float rlistOuterSq,
+ const float rlistInnerSq,
+ const int numParts,
+ const int part)
+{
+ cgh.require(a_xq);
+ cgh.require(a_shiftVec);
+ cgh.require(a_plistCJ4);
+ cgh.require(a_plistSci);
+ cgh.require(a_plistIMask);
+
+ /* shmem buffer for i x+q pre-loading */
+ cl::sycl::accessor<float4, 2, mode::read_write, target::local> sm_xq(
+ cl::sycl::range<2>(c_nbnxnGpuNumClusterPerSupercluster, c_clSize), cgh);
+
+ constexpr int warpSize = c_clSize * c_clSize / 2;
+
+ /* Somewhat weird behavior inherited from OpenCL.
+ * With clSize == 4, we use sub_group size of 16 (not enforced in OpenCL implementation, but chosen
+ * by the IGC compiler), however for data layout we consider it to be 8.
+ * Setting sub_group size to 8 slows down the prune-only kernel 1.5-2 times.
+ * For clSize == But we need to set specific sub_group size >= 32 for clSize == 8 for correctness,
+ * but it causes very poor performance.
+ */
+ constexpr int requiredSubGroupSize = (c_clSize == 4) ? 16 : warpSize;
+
+ /* Requirements:
+ * Work group (block) must have range (c_clSize, c_clSize, ...) (for localId calculation, easy
+ * to change). */
+ return [=](cl::sycl::nd_item<1> itemIdx) [[intel::reqd_sub_group_size(requiredSubGroupSize)]]
+ {
+ const cl::sycl::id<3> localId = unflattenId<c_clSize, c_clSize>(itemIdx.get_local_id());
+ // thread/block/warp id-s
+ const unsigned tidxi = localId[0];
+ const unsigned tidxj = localId[1];
+ const int tidx = tidxj * c_clSize + tidxi;
+ const unsigned tidxz = localId[2];
+ const unsigned bidx = itemIdx.get_group(0);
+
+ const sycl_2020::sub_group sg = itemIdx.get_sub_group();
+ const unsigned widx = tidx / warpSize;
+
+ // my i super-cluster's index = sciOffset + current bidx * numParts + part
+ const nbnxn_sci_t nbSci = a_plistSci[bidx * numParts + part];
+ const int sci = nbSci.sci; /* super-cluster */
+ const int cij4Start = nbSci.cj4_ind_start; /* first ...*/
+ const int cij4End = nbSci.cj4_ind_end; /* and last index of j clusters */
+
+ if (tidxz == 0)
+ {
+ for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i += c_clSize)
+ {
+ /* Pre-load i-atom x and q into shared memory */
+ const int ci = sci * c_nbnxnGpuNumClusterPerSupercluster + tidxj + i;
+ const int ai = ci * c_clSize + tidxi;
+
+ /* We don't need q, but using float4 in shmem avoids bank conflicts.
+ (but it also wastes L2 bandwidth). */
+ const float4 xq = a_xq[ai];
+ const float3 shift = a_shiftVec[nbSci.shift];
+ const float4 xi(xq[0] + shift[0], xq[1] + shift[1], xq[2] + shift[2], xq[3]);
+ sm_xq[tidxj + i][tidxi] = xi;
+ }
+ }
+ itemIdx.barrier(fence_space::local_space);
+
+ /* loop over the j clusters = seen by any of the atoms in the current super-cluster.
+ * The loop stride c_syclPruneKernelJ4Concurrency ensures that consecutive warps-pairs are
+ * assigned consecutive j4's entries. */
+ for (int j4 = cij4Start + tidxz; j4 < cij4End; j4 += c_syclPruneKernelJ4Concurrency)
+ {
+ unsigned imaskFull, imaskCheck, imaskNew;
+
+ if constexpr (haveFreshList)
+ {
+ /* Read the mask from the list transferred from the CPU */
+ imaskFull = a_plistCJ4[j4].imei[widx].imask;
+ /* We attempt to prune all pairs present in the original list */
+ imaskCheck = imaskFull;
+ imaskNew = 0;
+ }
+ else
+ {
+ /* Read the mask from the "warp-pruned" by rlistOuter mask array */
+ imaskFull = a_plistIMask[j4 * c_nbnxnGpuClusterpairSplit + widx];
+ /* Read the old rolling pruned mask, use as a base for new */
+ imaskNew = a_plistCJ4[j4].imei[widx].imask;
+ /* We only need to check pairs with different mask */
+ imaskCheck = (imaskNew ^ imaskFull);
+ }
+
+ if (imaskCheck)
+ {
+ for (int jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
+ {
+ if (imaskCheck & (superClInteractionMask << (jm * c_nbnxnGpuNumClusterPerSupercluster)))
+ {
+ unsigned mask_ji = (1U << (jm * c_nbnxnGpuNumClusterPerSupercluster));
+ // SYCL-TODO: Reevaluate prefetching methods
+ const int cj = a_plistCJ4[j4].cj[jm];
+ const int aj = cj * c_clSize + tidxj;
+
+ /* load j atom data */
+ const float4 tmp = a_xq[aj];
+ const float3 xj(tmp[0], tmp[1], tmp[2]);
+
+ for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
+ {
+ if (imaskCheck & mask_ji)
+ {
+ // load i-cluster coordinates from shmem
+ const float4 xi = sm_xq[i][tidxi];
+ // distance between i and j atoms
+ float3 rv(xi[0], xi[1], xi[2]);
+ rv -= xj;
+ const float r2 = norm2(rv);
+
+ /* If _none_ of the atoms pairs are in rlistOuter
+ * range, the bit corresponding to the current
+ * cluster-pair in imask gets set to 0. */
+ if (haveFreshList && !(sycl_2020::group_any_of(sg, r2 < rlistOuterSq)))
+ {
+ imaskFull &= ~mask_ji;
+ }
+ /* If any atom pair is within range, set the bit
+ * corresponding to the current cluster-pair. */
+ if (sycl_2020::group_any_of(sg, r2 < rlistInnerSq))
+ {
+ imaskNew |= mask_ji;
+ }
+ } // (imaskCheck & mask_ji)
+ /* shift the mask bit by 1 */
+ mask_ji += mask_ji;
+ } // (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
+ } // (imaskCheck & (superClInteractionMask << (jm * c_nbnxnGpuNumClusterPerSupercluster)))
+ } // for (int jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
+
+ if constexpr (haveFreshList)
+ {
+ /* copy the list pruned to rlistOuter to a separate buffer */
+ a_plistIMask[j4 * c_nbnxnGpuClusterpairSplit + widx] = imaskFull;
+ }
+ /* update the imask with only the pairs up to rlistInner */
+ a_plistCJ4[j4].imei[widx].imask = imaskNew;
+ } // (imaskCheck)
+ } // for (int j4 = cij4_start + tidxz; j4 < cij4_end; j4 += c_syclPruneKernelJ4Concurrency)
+ };
+}
+
+// SYCL 1.2.1 requires providing a unique type for a kernel. Should not be needed for SYCL2020.
+template<bool haveFreshList>
+class NbnxmKernelPruneOnlyName;
+
+template<bool haveFreshList, class... Args>
+cl::sycl::event launchNbnxmKernelPruneOnly(const DeviceStream& deviceStream,
+ const int numSciInPart,
+ Args&&... args)
+{
+ // Should not be needed for SYCL2020.
+ using kernelNameType = NbnxmKernelPruneOnlyName<haveFreshList>;
+
+ /* Kernel launch config:
+ * - The thread block dimensions match the size of i-clusters, j-clusters,
+ * and j-cluster concurrency, in x, y, and z, respectively.
+ * - The 1D block-grid contains as many blocks as super-clusters.
+ */
+ const unsigned long numBlocks = numSciInPart;
+ const cl::sycl::range<3> blockSize{ c_clSize, c_clSize, c_syclPruneKernelJ4Concurrency };
+ const cl::sycl::range<3> globalSize{ numBlocks * blockSize[0], blockSize[1], blockSize[2] };
+ const cl::sycl::nd_range<3> range{ globalSize, blockSize };
+
+ cl::sycl::queue q = deviceStream.stream();
+
+ cl::sycl::event e = q.submit([&](cl::sycl::handler& cgh) {
+ auto kernel = nbnxmKernelPruneOnly<haveFreshList>(cgh, std::forward<Args>(args)...);
+ cgh.parallel_for<kernelNameType>(flattenNDRange(range), kernel);
+ });
+
+ return e;
+}
+
+template<class... Args>
+cl::sycl::event chooseAndLaunchNbnxmKernelPruneOnly(bool haveFreshList, Args&&... args)
+{
+ return gmx::dispatchTemplatedFunction(
+ [&](auto haveFreshList_) {
+ return launchNbnxmKernelPruneOnly<haveFreshList_>(std::forward<Args>(args)...);
+ },
+ haveFreshList);
+}
+
+void launchNbnxmKernelPruneOnly(NbnxmGpu* nb,
+ const InteractionLocality iloc,
+ const int numParts,
+ const int part,
+ const int numSciInPart)
+{
+ sycl_atomdata_t* adat = nb->atdat;
+ NBParamGpu* nbp = nb->nbparam;
+ gpu_plist* plist = nb->plist[iloc];
+ const bool haveFreshList = plist->haveFreshList;
+ const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
+
+ cl::sycl::event e = chooseAndLaunchNbnxmKernelPruneOnly(haveFreshList,
+ deviceStream,
+ numSciInPart,
+ adat->xq,
+ adat->shiftVec,
+ plist->cj4,
+ plist->sci,
+ plist->imask,
+ nbp->rlistOuter_sq,
+ nbp->rlistInner_sq,
+ numParts,
+ part);
+}
+
+} // namespace Nbnxm
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ * \brief
+ * Declares nbnxn sycl helper functions
+ *
+ * \ingroup module_nbnxm
+ */
+#ifndef GMX_NBNXM_SYCL_NBNXM_SYCL_KERNEL_PRUNEONLY_H
+#define GMX_NBNXM_SYCL_NBNXM_SYCL_KERNEL_PRUNEONLY_H
+
+// Forward declarations
+namespace gmx
+{
+enum class InteractionLocality;
+}
+struct NbnxmGpu;
+
+namespace Nbnxm
+{
+using gmx::InteractionLocality;
+
+void launchNbnxmKernelPruneOnly(NbnxmGpu* nb,
+ const InteractionLocality iloc,
+ const int numParts,
+ const int part,
+ const int numSciInPart);
+
+} // namespace Nbnxm
+
+#endif // GMX_NBNXM_SYCL_NBNXM_SYCL_KERNEL_PRUNEONLY_H
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ * \brief
+ * Helper functions and constants for SYCL NBNXM kernels
+ *
+ * \ingroup module_nbnxm
+ */
+#ifndef GMX_NBNXM_SYCL_NBNXN_SYCL_KERNEL_UTILS_H
+#define GMX_NBNXM_SYCL_NBNXN_SYCL_KERNEL_UTILS_H
+
+#include "gromacs/nbnxm/pairlist.h"
+#include "gromacs/nbnxm/pairlistparams.h"
+
+namespace Nbnxm
+{
+
+#ifndef GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY
+# define GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY 4
+#endif
+/*! \brief Macro defining default for the prune kernel's j4 processing concurrency.
+ *
+ * The GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY macro allows compile-time override.
+ */
+static constexpr int c_syclPruneKernelJ4Concurrency = GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY;
+
+/* Convenience constants */
+/*! \cond */
+// cluster size = number of atoms per cluster.
+static constexpr int c_clSize = c_nbnxnGpuClusterSize;
+// j-cluster size after split (4 in the current implementation).
+static constexpr int c_splitClSize = c_clSize / c_nbnxnGpuClusterpairSplit;
+// i-cluster interaction mask for a super-cluster with all c_nbnxnGpuNumClusterPerSupercluster=8 bits set.
+static constexpr unsigned superClInteractionMask = ((1U << c_nbnxnGpuNumClusterPerSupercluster) - 1U);
+
+// 1/sqrt(pi), same value as \c M_FLOAT_1_SQRTPI in other NB kernels.
+static constexpr float c_OneOverSqrtPi = 0.564189583547756F;
+// 1/6, same value as in other NB kernels.
+static constexpr float c_oneSixth = 0.16666667F;
+// 1/12, same value as in other NB kernels.
+static constexpr float c_oneTwelfth = 0.08333333F;
+/*! \endcond */
+
+/* The following functions are necessary because on some versions of Intel OpenCL RT, subgroups
+ * do not properly work (segfault or create subgroups of size 1) if used in kernels
+ * with non-1-dimensional workgroup. */
+//! \brief Convert 3D range to 1D
+static inline cl::sycl::range<1> flattenRange(cl::sycl::range<3> range3d)
+{
+ return cl::sycl::range<1>(range3d.size());
+}
+
+//! \brief Convert 3D nd_range to 1D
+static inline cl::sycl::nd_range<1> flattenNDRange(cl::sycl::nd_range<3> nd_range3d)
+{
+ return cl::sycl::nd_range<1>(flattenRange(nd_range3d.get_global_range()),
+ flattenRange(nd_range3d.get_local_range()));
+}
+
+//! \brief Convert flattened 1D index to 3D
+template<int rangeX, int rangeY>
+static inline cl::sycl::id<3> unflattenId(cl::sycl::id<1> id1d)
+{
+ constexpr unsigned rangeXY = rangeX * rangeY;
+ const unsigned id = id1d[0];
+ const unsigned z = id / rangeXY;
+ const unsigned xy = id % rangeXY;
+ return cl::sycl::id<3>(xy % rangeX, xy / rangeX, z);
+}
+
+//! \brief Convenience wrapper to do atomic addition to a global buffer
+template<cl::sycl::access::mode Mode, class IndexType>
+static inline void atomicFetchAdd(DeviceAccessor<float, Mode> acc, const IndexType idx, const float val)
+{
+ if (cl::sycl::isnormal(val))
+ {
+ sycl_2020::atomic_ref<float, sycl_2020::memory_order::relaxed, sycl_2020::memory_scope::device, cl::sycl::access::address_space::global_space>
+ fout_atomic(acc[idx]);
+ fout_atomic.fetch_add(val);
+ }
+}
+
+static inline float shuffleDown(float var, unsigned int delta, sycl_2020::sub_group sg)
+{
+ return sg.shuffle_down(var, delta);
+}
+
+static inline float shuffleUp(float var, unsigned int delta, sycl_2020::sub_group sg)
+{
+ return sg.shuffle_up(var, delta);
+}
+
+} // namespace Nbnxm
+
+#endif // GMX_NBNXM_SYCL_NBNXN_SYCL_KERNEL_UTILS_H
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief
+ * Data types used internally in the nbnxm_sycl module.
+ *
+ * \ingroup module_nbnxm
+ */
+
+#ifndef NBNXM_SYCL_TYPES_H
+#define NBNXM_SYCL_TYPES_H
+
+#include "gromacs/gpu_utils/devicebuffer.h"
+#include "gromacs/gpu_utils/devicebuffer_sycl.h"
+#include "gromacs/gpu_utils/gmxsycl.h"
+#include "gromacs/gpu_utils/gpueventsynchronizer_sycl.h"
+#include "gromacs/gpu_utils/gputraits.h"
+#include "gromacs/gpu_utils/syclutils.h"
+#include "gromacs/nbnxm/gpu_types_common.h"
+#include "gromacs/nbnxm/nbnxm.h"
+#include "gromacs/nbnxm/pairlist.h"
+#include "gromacs/timing/gpu_timing.h"
+#include "gromacs/utility/enumerationhelpers.h"
+
+/*! \internal
+ * \brief Staging area for temporary data downloaded from the GPU.
+ *
+ * Since SYCL buffers already have host-side storage, this is a bit redundant.
+ * But it allows prefetching of the data from GPU, and brings GPU backends closer together.
+ */
+struct nb_staging_t
+{
+ //! LJ energy
+ float* e_lj = nullptr;
+ //! electrostatic energy
+ float* e_el = nullptr;
+ //! shift forces
+ float3* fshift = nullptr;
+};
+
+/** \internal
+ * \brief Nonbonded atom data - both inputs and outputs.
+ */
+struct sycl_atomdata_t
+{
+ //! number of atoms
+ int natoms;
+ //! number of local atoms
+ int natoms_local; //
+ //! allocation size for the atom data (xq, f)
+ int numAlloc;
+
+ //! atom coordinates + charges, size \ref natoms
+ DeviceBuffer<float4> xq;
+ //! force output array, size \ref natoms
+ DeviceBuffer<float3> f;
+
+ //! LJ energy output, size 1
+ DeviceBuffer<float> eLJ;
+ //! Electrostatics energy input, size 1
+ DeviceBuffer<float> eElec;
+
+ //! shift forces
+ DeviceBuffer<float3> fShift;
+
+ //! number of atom types
+ int numTypes;
+ //! atom type indices, size \ref natoms
+ DeviceBuffer<int> atomTypes;
+ //! sqrt(c6),sqrt(c12) size \ref natoms
+ DeviceBuffer<float2> ljComb;
+
+ //! shifts
+ DeviceBuffer<float3> shiftVec;
+ //! true if the shift vector has been uploaded
+ bool shiftVecUploaded;
+};
+
+class GpuEventSynchronizer;
+
+/*! \internal
+ * \brief Main data structure for SYCL nonbonded force calculations.
+ */
+struct NbnxmGpu
+{
+ /*! \brief GPU device context.
+ *
+ * \todo Make it constant reference, once NbnxmGpu is a proper class.
+ */
+ const DeviceContext* deviceContext_;
+ /*! \brief true if doing both local/non-local NB work on GPU */
+ bool bUseTwoStreams = false;
+ /*! \brief true indicates that the nonlocal_done event was enqueued */
+ bool bNonLocalStreamActive = false;
+ /*! \brief atom data */
+ sycl_atomdata_t* atdat = nullptr;
+ /*! \brief f buf ops cell index mapping */
+
+ NBParamGpu* nbparam = nullptr;
+ /*! \brief pair-list data structures (local and non-local) */
+ gmx::EnumerationArray<Nbnxm::InteractionLocality, Nbnxm::gpu_plist*> plist = { { nullptr } };
+ /*! \brief staging area where fshift/energies get downloaded. Will be removed in SYCL. */
+ nb_staging_t nbst;
+ /*! \brief local and non-local GPU streams */
+ gmx::EnumerationArray<Nbnxm::InteractionLocality, const DeviceStream*> deviceStreams;
+
+ /*! \brief True if event-based timing is enabled. Always false for SYCL. */
+ bool bDoTime = false;
+ /*! \brief Dummy timers. */
+ Nbnxm::gpu_timers_t* timers = nullptr;
+ /*! \brief Dummy timing data. */
+ gmx_wallclock_gpu_nbnxn_t* timings = nullptr;
+
+ //! true when a pair-list transfer has been done at this step
+ gmx::EnumerationArray<Nbnxm::InteractionLocality, bool> didPairlistH2D = { { false } };
+ //! true when we we did pruning on this step
+ gmx::EnumerationArray<Nbnxm::InteractionLocality, bool> didPrune = { { false } };
+ //! true when we did rolling pruning (at the previous step)
+ gmx::EnumerationArray<Nbnxm::InteractionLocality, bool> didRollingPrune = { { false } };
+
+ /*! \brief Events used for synchronization. Would be deprecated in SYCL. */
+ /*! \{ */
+ /*! \brief Event triggered when the non-local non-bonded
+ * kernel is done (and the local transfer can proceed) */
+ GpuEventSynchronizer nonlocal_done;
+ /*! \brief Event triggered when the tasks issued in the local
+ * stream that need to precede the non-local force or buffer
+ * operation calculations are done (e.g. f buffer 0-ing, local
+ * x/q H2D, buffer op initialization in local stream that is
+ * required also by nonlocal stream ) */
+ GpuEventSynchronizer misc_ops_and_local_H2D_done;
+ /*! \} */
+
+ /*! \brief True if there is work for the current domain in the
+ * respective locality.
+ *
+ * This includes local/nonlocal GPU work, either bonded or
+ * nonbonded, scheduled to be executed in the current
+ * domain. As long as bonded work is not split up into
+ * local/nonlocal, if there is bonded GPU work, both flags
+ * will be true. */
+ gmx::EnumerationArray<Nbnxm::InteractionLocality, bool> haveWork = { { false } };
+
+ /*! \brief Pointer to event synchronizer triggered when the local
+ * GPU buffer ops / reduction is complete. Would be deprecated in SYCL.
+ *
+ * \note That the synchronizer is managed outside of this module
+ * in StatePropagatorDataGpu.
+ */
+ GpuEventSynchronizer* localFReductionDone = nullptr;
+
+ /*! \brief Event triggered when non-local coordinate buffer
+ * has been copied from device to host. Would be deprecated in SYCL. */
+ GpuEventSynchronizer* xNonLocalCopyD2HDone = nullptr;
+};
+
+#endif /* NBNXM_SYCL_TYPES_H */
# Both OpenCL (from JIT) and ThreadSanitizer (from how it
# checks) can take signficantly more time than other
# configurations.
- if (GMX_GPU_OPENCL)
+ if (GMX_GPU_OPENCL OR GMX_GPU_SYCL)
set(_timeout 240)
elseif (${CMAKE_BUILD_TYPE} STREQUAL TSAN)
set(_timeout 300)