From: Andrey Alekseenko Date: Mon, 10 May 2021 13:22:53 +0000 (+0000) Subject: Make GPU X buffer ops platform-agnostic X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=fc78ed3dcbf009d13ec2fe9fb3cbea43e9c26a13;p=alexxy%2Fgromacs.git Make GPU X buffer ops platform-agnostic --- diff --git a/src/gromacs/nbnxm/CMakeLists.txt b/src/gromacs/nbnxm/CMakeLists.txt index 91738e6235..51aa880f47 100644 --- a/src/gromacs/nbnxm/CMakeLists.txt +++ b/src/gromacs/nbnxm/CMakeLists.txt @@ -66,21 +66,22 @@ file (GLOB NBNXM_SOURCES if(GMX_GPU_CUDA) add_subdirectory(cuda) - gmx_add_libgromacs_sources(nbnxm_gpu_data_mgmt.cpp) + gmx_add_libgromacs_sources(nbnxm_gpu_data_mgmt.cpp nbnxm_gpu_buffer_ops.cpp) _gmx_add_files_to_property(CUDA_SOURCES nbnxm_gpu_data_mgmt.cpp + nbnxm_gpu_buffer_ops.cpp ) endif() if(GMX_GPU_OPENCL) add_subdirectory(opencl) set(NBNXM_OPENCL_KERNELS ${NBNXM_OPENCL_KERNELS} PARENT_SCOPE) - gmx_add_libgromacs_sources(nbnxm_gpu_data_mgmt.cpp) + gmx_add_libgromacs_sources(nbnxm_gpu_data_mgmt.cpp nbnxm_gpu_buffer_ops_stubs.cpp) endif() if(GMX_GPU_SYCL) add_subdirectory(sycl) - gmx_add_libgromacs_sources(nbnxm_gpu_data_mgmt.cpp) + gmx_add_libgromacs_sources(nbnxm_gpu_data_mgmt.cpp nbnxm_gpu_buffer_ops_stubs.cpp) _gmx_add_files_to_property(SYCL_SOURCES nbnxm_gpu_data_mgmt.cpp nbnxm.cpp) endif() diff --git a/src/gromacs/nbnxm/cuda/CMakeLists.txt b/src/gromacs/nbnxm/cuda/CMakeLists.txt index 15c047605a..9338868bdd 100644 --- a/src/gromacs/nbnxm/cuda/CMakeLists.txt +++ b/src/gromacs/nbnxm/cuda/CMakeLists.txt @@ -2,7 +2,7 @@ # This file is part of the GROMACS molecular simulation package. # # Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team. -# Copyright (c) 2017,2019,2020, by the GROMACS development team, led by +# Copyright (c) 2017,2019,2020,2021, by the GROMACS development team, led by # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, # and including many others, as listed in the AUTHORS file in the # top-level source directory and at http://www.gromacs.org. @@ -47,6 +47,7 @@ if(GMX_GPU_CUDA) nbnxm_cuda.cu nbnxm_cuda_data_mgmt.cu nbnxm_cuda_jit_support.cu + nbnxm_gpu_buffer_ops_internal.cu ${NBNXM_CUDA_KERNEL_SOURCES}) set(NBNXM_SOURCES ${NBNXM_SOURCES} ${NBNXM_CUDA_SOURCES} PARENT_SCOPE) endif() diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu index 4869756afe..ac5e50c8b1 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu @@ -71,7 +71,6 @@ #include "gromacs/utility/cstringutil.h" #include "gromacs/utility/gmxassert.h" -#include "nbnxm_buffer_ops_kernels.cuh" #include "nbnxm_cuda_types.h" /***** The kernel declarations/definitions come here *****/ @@ -117,10 +116,6 @@ namespace Nbnxm { -//! Number of CUDA threads in a block -// TODO Optimize this through experimentation -constexpr static int c_bufOpsThreadsPerBlock = 128; - /*! Nonbonded kernel function pointer type */ typedef void (*nbnxn_cu_kfunc_ptr_t)(const NBAtomDataGpu, const NBParamGpu, const gpu_plist, bool); @@ -710,81 +705,4 @@ void cuda_set_cacheconfig() } } -/* X buffer operations on GPU: performs conversion from rvec to nb format. */ -void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid& grid, - NbnxmGpu* nb, - DeviceBuffer d_x, - GpuEventSynchronizer* xReadyOnDevice, - const Nbnxm::AtomLocality locality, - int gridId, - int numColumnsMax, - bool mustInsertNonLocalDependency) -{ - GMX_ASSERT(nb, "Need a valid nbnxn_gpu object"); - - NBAtomDataGpu* adat = nb->atdat; - - const int numColumns = grid.numColumns(); - const int cellOffset = grid.cellOffset(); - const int numAtomsPerCell = grid.numAtomsPerCell(); - Nbnxm::InteractionLocality interactionLoc = atomToInteractionLocality(locality); - - const DeviceStream& deviceStream = *nb->deviceStreams[interactionLoc]; - - if (xReadyOnDevice != nullptr) - { - // We only need to wait on the first iteration of the loop - xReadyOnDevice->enqueueWaitEvent(deviceStream); - } - - int numAtoms = grid.srcAtomEnd() - grid.srcAtomBegin(); - // avoid empty kernel launch, skip to inserting stream dependency - if (numAtoms != 0) - { - // TODO: This will only work with CUDA - GMX_ASSERT(d_x, "Need a valid device pointer"); - - - KernelLaunchConfig config; - config.blockSize[0] = c_bufOpsThreadsPerBlock; - config.blockSize[1] = 1; - config.blockSize[2] = 1; - config.gridSize[0] = (grid.numCellsColumnMax() * numAtomsPerCell + c_bufOpsThreadsPerBlock - 1) - / c_bufOpsThreadsPerBlock; - config.gridSize[1] = numColumns; - config.gridSize[2] = 1; - GMX_ASSERT(config.gridSize[0] > 0, - "Can not have empty grid, early return above avoids this"); - config.sharedMemorySize = 0; - - auto kernelFn = nbnxn_gpu_x_to_nbat_x_kernel; - float4* d_xq = adat->xq; - float3* d_xFloat3 = asFloat3(d_x); - const int* d_atomIndices = nb->atomIndices; - const int* d_cxy_na = &nb->cxy_na[numColumnsMax * gridId]; - const int* d_cxy_ind = &nb->cxy_ind[numColumnsMax * gridId]; - const auto kernelArgs = prepareGpuKernelArguments(kernelFn, - config, - &numColumns, - &d_xq, - &d_xFloat3, - &d_atomIndices, - &d_cxy_na, - &d_cxy_ind, - &cellOffset, - &numAtomsPerCell); - launchGpuKernel(kernelFn, config, deviceStream, nullptr, "XbufferOps", kernelArgs); - } - - if (mustInsertNonLocalDependency) - { - Nbnxm::nbnxnInsertNonlocalGpuDependency(nb, interactionLoc); - } -} - -DeviceBuffer getGpuForces(NbnxmGpu* nb) -{ - return nb->atdat->f; -} - } // namespace Nbnxm diff --git a/src/gromacs/nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh b/src/gromacs/nbnxm/cuda/nbnxm_gpu_buffer_ops_internal.cu similarity index 62% rename from src/gromacs/nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh rename to src/gromacs/nbnxm/cuda/nbnxm_gpu_buffer_ops_internal.cu index 2df2a14ea2..577b2b4b92 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh +++ b/src/gromacs/nbnxm/cuda/nbnxm_gpu_buffer_ops_internal.cu @@ -1,7 +1,8 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team. + * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -32,18 +33,21 @@ * To help us fund GROMACS development, we humbly ask that you cite * the research papers on the package. Check out http://www.gromacs.org. */ - -/*! \internal \file - * - * \brief - * CUDA kernels for GPU versions of copy_rvec_to_nbat_real and add_nbat_f_to_f. +/*! \file + * \brief Define CUDA kernel (and its wrapper) for transforming position coordinates from rvec to nbnxm layout. * * \author Alan Gray * \author Jon Vincent + * \author Szilard Pall */ +#include "gmxpre.h" + +#include "gromacs/gpu_utils/typecasts.cuh" #include "gromacs/gpu_utils/vectype_ops.cuh" -#include "gromacs/nbnxm/nbnxm.h" +#include "gromacs/nbnxm/grid.h" +#include "gromacs/nbnxm/nbnxm_gpu_buffer_ops_internal.h" +#include "gromacs/nbnxm/cuda/nbnxm_cuda_types.h" /*! \brief CUDA kernel for transforming position coordinates from rvec to nbnxm layout. * @@ -70,8 +74,7 @@ static __global__ void nbnxn_gpu_x_to_nbat_x_kernel(int numColumns, int numAtomsPerCell) { - - const float farAway = -1000000.0f; + const float farAway = -1000000.0F; // Map cell-level parallelism to y component of CUDA block index. int cxy = blockIdx.y; @@ -102,3 +105,46 @@ static __global__ void nbnxn_gpu_x_to_nbat_x_kernel(int numColumns, } } } + + +namespace Nbnxm +{ + +//! Number of CUDA threads in a block +// TODO Optimize this through experimentation +constexpr static int c_bufOpsThreadsPerBlock = 128; + +void launchNbnxmKernelTransformXToXq(const Grid& grid, + NbnxmGpu* nb, + DeviceBuffer d_x, + const DeviceStream& deviceStream, + const unsigned int numColumnsMax, + const int gridId) +{ + const int numColumns = grid.numColumns(); + const int cellOffset = grid.cellOffset(); + const int numAtomsPerCell = grid.numAtomsPerCell(); + + KernelLaunchConfig config; + config.blockSize[0] = c_bufOpsThreadsPerBlock; + config.blockSize[1] = 1; + config.blockSize[2] = 1; + config.gridSize[0] = (grid.numCellsColumnMax() * numAtomsPerCell + c_bufOpsThreadsPerBlock - 1) + / c_bufOpsThreadsPerBlock; + config.gridSize[1] = numColumns; + config.gridSize[2] = 1; + GMX_ASSERT(config.gridSize[0] > 0, "Can not have empty grid, early return above avoids this"); + config.sharedMemorySize = 0; + + auto kernelFn = nbnxn_gpu_x_to_nbat_x_kernel; + float3* d_xFloat3 = asFloat3(d_x); + float4* d_xq = nb->atdat->xq; + const int* d_atomIndices = nb->atomIndices; + const int* d_cxy_na = &nb->cxy_na[numColumnsMax * gridId]; + const int* d_cxy_ind = &nb->cxy_ind[numColumnsMax * gridId]; + const auto kernelArgs = prepareGpuKernelArguments( + kernelFn, config, &numColumns, &d_xq, &d_xFloat3, &d_atomIndices, &d_cxy_na, &d_cxy_ind, &cellOffset, &numAtomsPerCell); + launchGpuKernel(kernelFn, config, deviceStream, nullptr, "XbufferOps", kernelArgs); +} + +} // namespace Nbnxm diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h index eb1becbf7d..a3633c2c7a 100644 --- a/src/gromacs/nbnxm/nbnxm_gpu.h +++ b/src/gromacs/nbnxm/nbnxm_gpu.h @@ -250,7 +250,7 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet gmx_unused& gridSet, * \param[in] mustInsertNonLocalDependency Whether synchronization between local and non-local * streams should be added. Typically, true if and only if that is the last grid in gridset. */ -CUDA_FUNC_QUALIFIER +GPU_FUNC_QUALIFIER void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid gmx_unused& grid, NbnxmGpu gmx_unused* gpu_nbv, DeviceBuffer gmx_unused d_x, @@ -258,7 +258,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid gmx_unused& grid, gmx::AtomLocality gmx_unused locality, int gmx_unused gridId, int gmx_unused numColumnsMax, - bool gmx_unused mustInsertNonLocalDependency) CUDA_FUNC_TERM; + bool gmx_unused mustInsertNonLocalDependency) GPU_FUNC_TERM; /*! \brief Sync the nonlocal stream with dependent tasks in the local queue. * @@ -311,9 +311,9 @@ bool haveGpuShortRangeWork(const NbnxmGpu gmx_unused* nb, gmx::InteractionLocali * \param[in] nb The nonbonded data GPU structure * \returns A pointer to the force buffer in GPU memory */ -CUDA_FUNC_QUALIFIER +GPU_FUNC_QUALIFIER DeviceBuffer getGpuForces(NbnxmGpu gmx_unused* nb) - CUDA_FUNC_TERM_WITH_RETURN(DeviceBuffer{}); + GPU_FUNC_TERM_WITH_RETURN(DeviceBuffer{}); } // namespace Nbnxm #endif diff --git a/src/gromacs/nbnxm/nbnxm_gpu_buffer_ops.cpp b/src/gromacs/nbnxm/nbnxm_gpu_buffer_ops.cpp new file mode 100644 index 0000000000..00a15a0c38 --- /dev/null +++ b/src/gromacs/nbnxm/nbnxm_gpu_buffer_ops.cpp @@ -0,0 +1,103 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2020,2021, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/*! \internal \file + * \brief + * Common code for GPU buffer operations, namely the coordinate layout conversion + * + * \ingroup module_nbnxm + */ +#include "gmxpre.h" + +#include "config.h" + +#include "gromacs/gpu_utils/device_stream.h" +#if GMX_GPU_CUDA +# include "gromacs/gpu_utils/gpueventsynchronizer.cuh" +#endif +#include "gromacs/mdtypes/locality.h" +#include "gromacs/nbnxm/gridset.h" +#include "gromacs/nbnxm/nbnxm_gpu.h" +#include "gromacs/nbnxm/nbnxm_gpu_buffer_ops_internal.h" +#if GMX_GPU_CUDA +# include "gromacs/nbnxm/cuda/nbnxm_cuda_types.h" +#endif +#include "gromacs/utility/exceptions.h" + +namespace Nbnxm +{ + +void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid& grid, + NbnxmGpu* nb, + DeviceBuffer d_x, + GpuEventSynchronizer* xReadyOnDevice, + const gmx::AtomLocality locality, + int gridId, + int numColumnsMax, + bool mustInsertNonLocalDependency) +{ + GMX_RELEASE_ASSERT(GMX_GPU_CUDA, "nbnxn_gpu_x_to_nbat_x only supported with CUDA"); + GMX_ASSERT(nb, "Need a valid nbnxn_gpu object"); + gmx::InteractionLocality interactionLoc = gmx::atomToInteractionLocality(locality); + + const DeviceStream& deviceStream = *nb->deviceStreams[interactionLoc]; + + const int numAtoms = grid.srcAtomEnd() - grid.srcAtomBegin(); + + // Only insert wait on the first iteration of the loop. + if (xReadyOnDevice != nullptr) + { + xReadyOnDevice->enqueueWaitEvent(deviceStream); + } + + // avoid empty kernel launch, skip to inserting stream dependency + if (numAtoms != 0) + { + GMX_ASSERT(d_x, "Need a valid device pointer"); + launchNbnxmKernelTransformXToXq(grid, nb, d_x, deviceStream, numColumnsMax, gridId); + } + + if (mustInsertNonLocalDependency) + { + Nbnxm::nbnxnInsertNonlocalGpuDependency(nb, interactionLoc); + } +} + +DeviceBuffer getGpuForces(NbnxmGpu* nb) +{ + return nb->atdat->f; +} + +} // namespace Nbnxm diff --git a/src/gromacs/nbnxm/nbnxm_gpu_buffer_ops_internal.h b/src/gromacs/nbnxm/nbnxm_gpu_buffer_ops_internal.h new file mode 100644 index 0000000000..dc91a7a299 --- /dev/null +++ b/src/gromacs/nbnxm/nbnxm_gpu_buffer_ops_internal.h @@ -0,0 +1,68 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2020,2021, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/*! \internal \file + * \brief + * Wrapper for the backend-specific coordinate layout conversion functionality + * + * \ingroup module_nbnxm + */ +#include "gromacs/gpu_utils/devicebuffer_datatype.h" +#include "gromacs/gpu_utils/gputraits.h" + +class DeviceStream; +class Grid; +struct NbnxmGpu; + +namespace Nbnxm +{ + +/*! \brief Launch coordinate layout conversion kernel + * + * \param[in] grid Pair-search grid. + * \param[in,out] nb Nbnxm main structure. + * \param[in] d_x Source atom coordinates. + * \param[in] deviceStream Device stream for kernel submission. + * \param[in] numColumnsMax Max. number of columns per grid for offset calculation in \p nb. + * \param[in] gridId Grid index for offset calculation in \p nb. + */ +void launchNbnxmKernelTransformXToXq(const Grid& grid, + NbnxmGpu* nb, + DeviceBuffer d_x, + const DeviceStream& deviceStream, + unsigned int numColumnsMax, + int gridId); + +} // namespace Nbnxm diff --git a/src/gromacs/nbnxm/nbnxm_gpu_buffer_ops_stubs.cpp b/src/gromacs/nbnxm/nbnxm_gpu_buffer_ops_stubs.cpp new file mode 100644 index 0000000000..2d35afd3fe --- /dev/null +++ b/src/gromacs/nbnxm/nbnxm_gpu_buffer_ops_stubs.cpp @@ -0,0 +1,73 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2020,2021, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/*! \internal \file + * \brief + * Common code for GPU buffer operations, namely the coordinate layout conversion + * + * \ingroup module_nbnxm + */ +#include "gmxpre.h" + +#include "config.h" + +#include "gromacs/nbnxm/nbnxm_gpu.h" + +#if !GMX_GPU_CUDA + +namespace Nbnxm +{ + +void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid& /*grid*/, + NbnxmGpu* /*nb*/, + DeviceBuffer /*d_x*/, + GpuEventSynchronizer* /*xReadyOnDevice*/, + const gmx::AtomLocality /*locality*/, + int /*gridId*/, + int /*numColumnsMax*/, + bool /*mustInsertNonLocalDependency*/) +{ + GMX_RELEASE_ASSERT(false, "nbnxn_gpu_x_to_nbat_x only supported with CUDA"); +} + +DeviceBuffer getGpuForces(NbnxmGpu* /*nb*/) +{ + GMX_RELEASE_ASSERT(false, "getGpuForces only supported with CUDA"); + return DeviceBuffer{}; +} + +} // namespace Nbnxm + +#endif // !GMX_GPU_CUDA