if(GMX_GPU_CUDA)
add_subdirectory(cuda)
- gmx_add_libgromacs_sources(nbnxm_gpu_data_mgmt.cpp)
+ gmx_add_libgromacs_sources(nbnxm_gpu_data_mgmt.cpp nbnxm_gpu_buffer_ops.cpp)
_gmx_add_files_to_property(CUDA_SOURCES
nbnxm_gpu_data_mgmt.cpp
+ nbnxm_gpu_buffer_ops.cpp
)
endif()
if(GMX_GPU_OPENCL)
add_subdirectory(opencl)
set(NBNXM_OPENCL_KERNELS ${NBNXM_OPENCL_KERNELS} PARENT_SCOPE)
- gmx_add_libgromacs_sources(nbnxm_gpu_data_mgmt.cpp)
+ gmx_add_libgromacs_sources(nbnxm_gpu_data_mgmt.cpp nbnxm_gpu_buffer_ops_stubs.cpp)
endif()
if(GMX_GPU_SYCL)
add_subdirectory(sycl)
- gmx_add_libgromacs_sources(nbnxm_gpu_data_mgmt.cpp)
+ gmx_add_libgromacs_sources(nbnxm_gpu_data_mgmt.cpp nbnxm_gpu_buffer_ops_stubs.cpp)
_gmx_add_files_to_property(SYCL_SOURCES nbnxm_gpu_data_mgmt.cpp nbnxm.cpp)
endif()
# This file is part of the GROMACS molecular simulation package.
#
# Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
-# Copyright (c) 2017,2019,2020, by the GROMACS development team, led by
+# Copyright (c) 2017,2019,2020,2021, by the GROMACS development team, led by
# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
# and including many others, as listed in the AUTHORS file in the
# top-level source directory and at http://www.gromacs.org.
nbnxm_cuda.cu
nbnxm_cuda_data_mgmt.cu
nbnxm_cuda_jit_support.cu
+ nbnxm_gpu_buffer_ops_internal.cu
${NBNXM_CUDA_KERNEL_SOURCES})
set(NBNXM_SOURCES ${NBNXM_SOURCES} ${NBNXM_CUDA_SOURCES} PARENT_SCOPE)
endif()
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/gmxassert.h"
-#include "nbnxm_buffer_ops_kernels.cuh"
#include "nbnxm_cuda_types.h"
/***** The kernel declarations/definitions come here *****/
namespace Nbnxm
{
-//! Number of CUDA threads in a block
-// TODO Optimize this through experimentation
-constexpr static int c_bufOpsThreadsPerBlock = 128;
-
/*! Nonbonded kernel function pointer type */
typedef void (*nbnxn_cu_kfunc_ptr_t)(const NBAtomDataGpu, const NBParamGpu, const gpu_plist, bool);
}
}
-/* X buffer operations on GPU: performs conversion from rvec to nb format. */
-void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid& grid,
- NbnxmGpu* nb,
- DeviceBuffer<gmx::RVec> d_x,
- GpuEventSynchronizer* xReadyOnDevice,
- const Nbnxm::AtomLocality locality,
- int gridId,
- int numColumnsMax,
- bool mustInsertNonLocalDependency)
-{
- GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
-
- NBAtomDataGpu* adat = nb->atdat;
-
- const int numColumns = grid.numColumns();
- const int cellOffset = grid.cellOffset();
- const int numAtomsPerCell = grid.numAtomsPerCell();
- Nbnxm::InteractionLocality interactionLoc = atomToInteractionLocality(locality);
-
- const DeviceStream& deviceStream = *nb->deviceStreams[interactionLoc];
-
- if (xReadyOnDevice != nullptr)
- {
- // We only need to wait on the first iteration of the loop
- xReadyOnDevice->enqueueWaitEvent(deviceStream);
- }
-
- int numAtoms = grid.srcAtomEnd() - grid.srcAtomBegin();
- // avoid empty kernel launch, skip to inserting stream dependency
- if (numAtoms != 0)
- {
- // TODO: This will only work with CUDA
- GMX_ASSERT(d_x, "Need a valid device pointer");
-
-
- KernelLaunchConfig config;
- config.blockSize[0] = c_bufOpsThreadsPerBlock;
- config.blockSize[1] = 1;
- config.blockSize[2] = 1;
- config.gridSize[0] = (grid.numCellsColumnMax() * numAtomsPerCell + c_bufOpsThreadsPerBlock - 1)
- / c_bufOpsThreadsPerBlock;
- config.gridSize[1] = numColumns;
- config.gridSize[2] = 1;
- GMX_ASSERT(config.gridSize[0] > 0,
- "Can not have empty grid, early return above avoids this");
- config.sharedMemorySize = 0;
-
- auto kernelFn = nbnxn_gpu_x_to_nbat_x_kernel;
- float4* d_xq = adat->xq;
- float3* d_xFloat3 = asFloat3(d_x);
- const int* d_atomIndices = nb->atomIndices;
- const int* d_cxy_na = &nb->cxy_na[numColumnsMax * gridId];
- const int* d_cxy_ind = &nb->cxy_ind[numColumnsMax * gridId];
- const auto kernelArgs = prepareGpuKernelArguments(kernelFn,
- config,
- &numColumns,
- &d_xq,
- &d_xFloat3,
- &d_atomIndices,
- &d_cxy_na,
- &d_cxy_ind,
- &cellOffset,
- &numAtomsPerCell);
- launchGpuKernel(kernelFn, config, deviceStream, nullptr, "XbufferOps", kernelArgs);
- }
-
- if (mustInsertNonLocalDependency)
- {
- Nbnxm::nbnxnInsertNonlocalGpuDependency(nb, interactionLoc);
- }
-}
-
-DeviceBuffer<Float3> getGpuForces(NbnxmGpu* nb)
-{
- return nb->atdat->f;
-}
-
} // namespace Nbnxm
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
+ * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
* To help us fund GROMACS development, we humbly ask that you cite
* the research papers on the package. Check out http://www.gromacs.org.
*/
-
-/*! \internal \file
- *
- * \brief
- * CUDA kernels for GPU versions of copy_rvec_to_nbat_real and add_nbat_f_to_f.
+/*! \file
+ * \brief Define CUDA kernel (and its wrapper) for transforming position coordinates from rvec to nbnxm layout.
*
* \author Alan Gray <alang@nvidia.com>
* \author Jon Vincent <jvincent@nvidia.com>
+ * \author Szilard Pall <pall.szilard@gmail.com>
*/
+#include "gmxpre.h"
+
+#include "gromacs/gpu_utils/typecasts.cuh"
#include "gromacs/gpu_utils/vectype_ops.cuh"
-#include "gromacs/nbnxm/nbnxm.h"
+#include "gromacs/nbnxm/grid.h"
+#include "gromacs/nbnxm/nbnxm_gpu_buffer_ops_internal.h"
+#include "gromacs/nbnxm/cuda/nbnxm_cuda_types.h"
/*! \brief CUDA kernel for transforming position coordinates from rvec to nbnxm layout.
*
int numAtomsPerCell)
{
-
- const float farAway = -1000000.0f;
+ const float farAway = -1000000.0F;
// Map cell-level parallelism to y component of CUDA block index.
int cxy = blockIdx.y;
}
}
}
+
+
+namespace Nbnxm
+{
+
+//! Number of CUDA threads in a block
+// TODO Optimize this through experimentation
+constexpr static int c_bufOpsThreadsPerBlock = 128;
+
+void launchNbnxmKernelTransformXToXq(const Grid& grid,
+ NbnxmGpu* nb,
+ DeviceBuffer<Float3> d_x,
+ const DeviceStream& deviceStream,
+ const unsigned int numColumnsMax,
+ const int gridId)
+{
+ const int numColumns = grid.numColumns();
+ const int cellOffset = grid.cellOffset();
+ const int numAtomsPerCell = grid.numAtomsPerCell();
+
+ KernelLaunchConfig config;
+ config.blockSize[0] = c_bufOpsThreadsPerBlock;
+ config.blockSize[1] = 1;
+ config.blockSize[2] = 1;
+ config.gridSize[0] = (grid.numCellsColumnMax() * numAtomsPerCell + c_bufOpsThreadsPerBlock - 1)
+ / c_bufOpsThreadsPerBlock;
+ config.gridSize[1] = numColumns;
+ config.gridSize[2] = 1;
+ GMX_ASSERT(config.gridSize[0] > 0, "Can not have empty grid, early return above avoids this");
+ config.sharedMemorySize = 0;
+
+ auto kernelFn = nbnxn_gpu_x_to_nbat_x_kernel;
+ float3* d_xFloat3 = asFloat3(d_x);
+ float4* d_xq = nb->atdat->xq;
+ const int* d_atomIndices = nb->atomIndices;
+ const int* d_cxy_na = &nb->cxy_na[numColumnsMax * gridId];
+ const int* d_cxy_ind = &nb->cxy_ind[numColumnsMax * gridId];
+ const auto kernelArgs = prepareGpuKernelArguments(
+ kernelFn, config, &numColumns, &d_xq, &d_xFloat3, &d_atomIndices, &d_cxy_na, &d_cxy_ind, &cellOffset, &numAtomsPerCell);
+ launchGpuKernel(kernelFn, config, deviceStream, nullptr, "XbufferOps", kernelArgs);
+}
+
+} // namespace Nbnxm
* \param[in] mustInsertNonLocalDependency Whether synchronization between local and non-local
* streams should be added. Typically, true if and only if that is the last grid in gridset.
*/
-CUDA_FUNC_QUALIFIER
+GPU_FUNC_QUALIFIER
void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid gmx_unused& grid,
NbnxmGpu gmx_unused* gpu_nbv,
DeviceBuffer<gmx::RVec> gmx_unused d_x,
gmx::AtomLocality gmx_unused locality,
int gmx_unused gridId,
int gmx_unused numColumnsMax,
- bool gmx_unused mustInsertNonLocalDependency) CUDA_FUNC_TERM;
+ bool gmx_unused mustInsertNonLocalDependency) GPU_FUNC_TERM;
/*! \brief Sync the nonlocal stream with dependent tasks in the local queue.
*
* \param[in] nb The nonbonded data GPU structure
* \returns A pointer to the force buffer in GPU memory
*/
-CUDA_FUNC_QUALIFIER
+GPU_FUNC_QUALIFIER
DeviceBuffer<gmx::RVec> getGpuForces(NbnxmGpu gmx_unused* nb)
- CUDA_FUNC_TERM_WITH_RETURN(DeviceBuffer<gmx::RVec>{});
+ GPU_FUNC_TERM_WITH_RETURN(DeviceBuffer<gmx::RVec>{});
} // namespace Nbnxm
#endif
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief
+ * Common code for GPU buffer operations, namely the coordinate layout conversion
+ *
+ * \ingroup module_nbnxm
+ */
+#include "gmxpre.h"
+
+#include "config.h"
+
+#include "gromacs/gpu_utils/device_stream.h"
+#if GMX_GPU_CUDA
+# include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
+#endif
+#include "gromacs/mdtypes/locality.h"
+#include "gromacs/nbnxm/gridset.h"
+#include "gromacs/nbnxm/nbnxm_gpu.h"
+#include "gromacs/nbnxm/nbnxm_gpu_buffer_ops_internal.h"
+#if GMX_GPU_CUDA
+# include "gromacs/nbnxm/cuda/nbnxm_cuda_types.h"
+#endif
+#include "gromacs/utility/exceptions.h"
+
+namespace Nbnxm
+{
+
+void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid& grid,
+ NbnxmGpu* nb,
+ DeviceBuffer<gmx::RVec> d_x,
+ GpuEventSynchronizer* xReadyOnDevice,
+ const gmx::AtomLocality locality,
+ int gridId,
+ int numColumnsMax,
+ bool mustInsertNonLocalDependency)
+{
+ GMX_RELEASE_ASSERT(GMX_GPU_CUDA, "nbnxn_gpu_x_to_nbat_x only supported with CUDA");
+ GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+ gmx::InteractionLocality interactionLoc = gmx::atomToInteractionLocality(locality);
+
+ const DeviceStream& deviceStream = *nb->deviceStreams[interactionLoc];
+
+ const int numAtoms = grid.srcAtomEnd() - grid.srcAtomBegin();
+
+ // Only insert wait on the first iteration of the loop.
+ if (xReadyOnDevice != nullptr)
+ {
+ xReadyOnDevice->enqueueWaitEvent(deviceStream);
+ }
+
+ // avoid empty kernel launch, skip to inserting stream dependency
+ if (numAtoms != 0)
+ {
+ GMX_ASSERT(d_x, "Need a valid device pointer");
+ launchNbnxmKernelTransformXToXq(grid, nb, d_x, deviceStream, numColumnsMax, gridId);
+ }
+
+ if (mustInsertNonLocalDependency)
+ {
+ Nbnxm::nbnxnInsertNonlocalGpuDependency(nb, interactionLoc);
+ }
+}
+
+DeviceBuffer<Float3> getGpuForces(NbnxmGpu* nb)
+{
+ return nb->atdat->f;
+}
+
+} // namespace Nbnxm
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief
+ * Wrapper for the backend-specific coordinate layout conversion functionality
+ *
+ * \ingroup module_nbnxm
+ */
+#include "gromacs/gpu_utils/devicebuffer_datatype.h"
+#include "gromacs/gpu_utils/gputraits.h"
+
+class DeviceStream;
+class Grid;
+struct NbnxmGpu;
+
+namespace Nbnxm
+{
+
+/*! \brief Launch coordinate layout conversion kernel
+ *
+ * \param[in] grid Pair-search grid.
+ * \param[in,out] nb Nbnxm main structure.
+ * \param[in] d_x Source atom coordinates.
+ * \param[in] deviceStream Device stream for kernel submission.
+ * \param[in] numColumnsMax Max. number of columns per grid for offset calculation in \p nb.
+ * \param[in] gridId Grid index for offset calculation in \p nb.
+ */
+void launchNbnxmKernelTransformXToXq(const Grid& grid,
+ NbnxmGpu* nb,
+ DeviceBuffer<Float3> d_x,
+ const DeviceStream& deviceStream,
+ unsigned int numColumnsMax,
+ int gridId);
+
+} // namespace Nbnxm
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief
+ * Common code for GPU buffer operations, namely the coordinate layout conversion
+ *
+ * \ingroup module_nbnxm
+ */
+#include "gmxpre.h"
+
+#include "config.h"
+
+#include "gromacs/nbnxm/nbnxm_gpu.h"
+
+#if !GMX_GPU_CUDA
+
+namespace Nbnxm
+{
+
+void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid& /*grid*/,
+ NbnxmGpu* /*nb*/,
+ DeviceBuffer<gmx::RVec> /*d_x*/,
+ GpuEventSynchronizer* /*xReadyOnDevice*/,
+ const gmx::AtomLocality /*locality*/,
+ int /*gridId*/,
+ int /*numColumnsMax*/,
+ bool /*mustInsertNonLocalDependency*/)
+{
+ GMX_RELEASE_ASSERT(false, "nbnxn_gpu_x_to_nbat_x only supported with CUDA");
+}
+
+DeviceBuffer<gmx::RVec> getGpuForces(NbnxmGpu* /*nb*/)
+{
+ GMX_RELEASE_ASSERT(false, "getGpuForces only supported with CUDA");
+ return DeviceBuffer<gmx::RVec>{};
+}
+
+} // namespace Nbnxm
+
+#endif // !GMX_GPU_CUDA