GPU_FUNC_QUALIFIER void pme_gpu_reinit_computation(const gmx_pme_t *GPU_FUNC_ARGUMENT(pme),
gmx_wallcycle *GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM
+
+/*! \brief Get pointer to device copy of coordinate data. */
+GPU_FUNC_QUALIFIER void *pme_gpu_get_device_x(const gmx_pme_t *GPU_FUNC_ARGUMENT(pme)) GPU_FUNC_TERM_WITH_RETURN(nullptr)
#endif
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
wallcycle_stop(wcycle, ewcLAUNCH_GPU);
}
+
+void *pme_gpu_get_device_x(const gmx_pme_t *pme)
+{
+ if (!pme || !pme_gpu_active(pme))
+ {
+ return nullptr;
+ }
+ return pme_gpu_get_kernelparam_coordinates(pme->gpu);
+}
copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coordinates, h_coordinatesFloat,
0, pmeGpu->kernelParams->atoms.nAtoms * DIM,
pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
+ // FIXME: this sync is required because the copied data will be used by the PP stream
+ // when a single GPU is used for both PME and PP. Remove it once event-based
+ // synchronization between the above H2D copy and the transform kernel is in place.
+ pme_gpu_synchronize(pmeGpu);
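+ // Illustrative sketch of such an event-based scheme (xReadyOnDevice and ppStream are
+ // placeholder names assumed for this sketch, not existing members):
+ //   cudaEventRecord(xReadyOnDevice, pmeGpu->archSpecific->pmeStream);
+ //   cudaStreamWaitEvent(ppStream, xReadyOnDevice, 0); // enqueue transform kernel after this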
+
#endif
}
pme_gpu_copy_output_forces(pmeGpu);
}
+
+void * pme_gpu_get_kernelparam_coordinates(const PmeGpu *pmeGpu)
+{
+ if (pmeGpu && pmeGpu->kernelParams)
+ {
+ return pmeGpu->kernelParams->atoms.d_coordinates;
+ }
+ else
+ {
+ return nullptr;
+ }
+}
const float *GPU_FUNC_ARGUMENT(h_grid)
) GPU_FUNC_TERM
+/*! \brief Return pointer to device copy of coordinate data. */
+GPU_FUNC_QUALIFIER void * pme_gpu_get_kernelparam_coordinates(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM_WITH_RETURN(nullptr)
/* The inlined convenience PME GPU status getters */
// PME-first ordering would suffice).
static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
+// Environment variable to enable GPU buffer ops, to allow incremental and optional
+// introduction of this functionality.
+// TODO eventually tie this in with other existing GPU flags.
+static const bool c_enableGpuBufOps = (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr);
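+// Usage note (illustrative): any value enables the path, e.g. running mdrun with
+// GMX_USE_GPU_BUFFER_OPS=1 set in the environment.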
+
+
static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
{
((flags & GMX_FORCE_ENERGY) ? GMX_PME_CALC_ENER_VIR : 0) |
((flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0);
+ const bool useGpuXBufOps = (c_enableGpuBufOps && bUseGPU && (GMX_GPU == GMX_GPU_CUDA));
+
/* At a search step we need to start the first balancing region
* somewhere early inside the step after communication during domain
* decomposition (and not during the previous step as usual).
&top->excls, step, nrnb);
wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
wallcycle_stop(wcycle, ewcNS);
+
+ if (useGpuXBufOps)
+ {
+ nbv->atomdata_init_copy_x_to_nbat_x_gpu(Nbnxm::AtomLocality::Local);
+ }
+
}
else
{
nbv->setCoordinates(Nbnxm::AtomLocality::Local, false,
- x.unpaddedArrayRef(), wcycle);
+ x.unpaddedArrayRef(), useGpuXBufOps, pme_gpu_get_device_x(fr->pmedata), wcycle);
}
if (bUseGPU)
wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
Nbnxm::gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat.get());
- Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(),
- Nbnxm::AtomLocality::Local,
- ppForceWorkload->haveGpuBondedWork);
+ if (bNS || !useGpuXBufOps)
+ {
+ Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(),
+ Nbnxm::AtomLocality::Local,
+ ppForceWorkload->haveGpuBondedWork);
+ }
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+ // Note: with X buffer ops offloaded to the GPU, the xq H2D copy above is done only on search steps.
// bonded work not split into separate local and non-local, so with DD
// we can only launch the kernel after non-local coordinates have been received.
&top->excls, step, nrnb);
wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
wallcycle_stop(wcycle, ewcNS);
+
+ if (useGpuXBufOps)
+ {
+ nbv->atomdata_init_copy_x_to_nbat_x_gpu(Nbnxm::AtomLocality::NonLocal);
+ }
}
else
{
dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
nbv->setCoordinates(Nbnxm::AtomLocality::NonLocal, false,
- x.unpaddedArrayRef(), wcycle);
+ x.unpaddedArrayRef(), useGpuXBufOps, pme_gpu_get_device_x(fr->pmedata), wcycle);
}
if (bUseGPU)
{
wallcycle_start(wcycle, ewcLAUNCH_GPU);
- /* launch non-local nonbonded tasks on GPU */
- wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
- Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(),
- Nbnxm::AtomLocality::NonLocal,
- ppForceWorkload->haveGpuBondedWork);
- wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+ if (bNS || !useGpuXBufOps)
+ {
+ wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+ Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(),
+ Nbnxm::AtomLocality::NonLocal,
+ ppForceWorkload->haveGpuBondedWork);
+ wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+ }
if (ppForceWorkload->haveGpuBondedWork)
{
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
}
+ /* launch non-local nonbonded tasks on GPU */
wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
step, nrnb, wcycle);
const Nbnxm::AtomLocality locality,
gmx_bool FillLocal,
const rvec *x,
- nbnxn_atomdata_t *nbat)
+ nbnxn_atomdata_t *nbat,
+ bool useGpu,
+ gmx_nbnxn_gpu_t *gpu_nbv,
+ void *xPmeDevicePtr)
{
int gridBegin = 0;
int gridEnd = 0;
nbat->natoms_local = gridSet.grids()[0].atomIndexEnd();
}
- const int nth = gmx_omp_nthreads_get(emntPairsearch);
+ const int nth = useGpu ? 1 : gmx_omp_nthreads_get(emntPairsearch);
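+ // (with useGpu the host loop below only determines maxAtomsInColumn and the
+ // conversion itself is done by the GPU kernel, so a single thread suffices)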
#pragma omp parallel for num_threads(nth) schedule(static)
for (int th = 0; th < nth; th++)
for (int g = gridBegin; g < gridEnd; g++)
{
const Nbnxm::Grid &grid = gridSet.grids()[g];
+
+ int maxAtomsInColumn = 0;
+
const int numCellsXY = grid.numColumns();
const int cxy0 = (numCellsXY* th + nth - 1)/nth;
*/
na_fill = na;
}
- copy_rvec_to_nbat_real(gridSet.atomIndices().data() + ash,
- na, na_fill, x,
- nbat->XFormat, nbat->x().data(), ash);
+ if (useGpu)
+ {
+ // All columns will be processed in a single GPU kernel (below).
+ // We need to determine the maximum number of atoms in a column
+ maxAtomsInColumn = std::max(maxAtomsInColumn, na);
+ }
+ else
+ {
+ copy_rvec_to_nbat_real(gridSet.atomIndices().data() + ash,
+ na, na_fill, x,
+ nbat->XFormat, nbat->x().data(), ash);
+ }
+ }
+ if (useGpu)
+ {
+ nbnxn_gpu_x_to_nbat_x(gridSet,
+ g,
+ FillLocal,
+ gpu_nbv,
+ xPmeDevicePtr,
+ maxAtomsInColumn,
+ locality,
+ x);
}
}
}
#include "gromacs/utility/bitmask.h"
#include "gromacs/utility/real.h"
+#include "gpu_types.h"
#include "locality.h"
namespace gmx
Nbnxm::AtomLocality locality,
gmx_bool FillLocal,
const rvec *x,
- nbnxn_atomdata_t *nbat);
+ nbnxn_atomdata_t *nbat,
+ bool useGpu,
+ gmx_nbnxn_gpu_t *gpu_nbv,
+ void *xPmeDevicePtr);
//! Add the computed forces to \p f, an internal reduction might be performed as well
void reduceForces(nbnxn_atomdata_t *nbat,
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2018,2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ *
+ * \brief
+ * CUDA kernel for GPU version of copy_rvec_to_nbat_real.
+ * Converts coordinate data from rvec to nb format.
+ *
+ * \author Alan Gray <alang@nvidia.com>
+ * \author Jon Vincent <jvincent@nvidia.com>
+ */
+
+#include "gromacs/nbnxm/nbnxm.h"
+
+/*! \brief CUDA kernel for transforming position coordinates from rvec to nbnxm layout.
+ *
+ * \param[in] ncxy number of grid columns (extent of column-level parallelism)
+ * \param[out] xnb position buffer in nbnxm layout
+ * \param[in] g grid index
+ * \param[in] FillLocal true if the padded parts of the local-grid columns should also be filled
+ * \param[in] x position buffer in rvec format
+ * \param[in] a atom index mapping from nbnxm order to rvec order
+ * \param[in] na_all number of non-filler atoms in each grid column
+ * \param[in] cxy_ind grid-local cell index for each grid column
+ * \param[in] cell0 index of the grid's first cell (cell offset)
+ * \param[in] na_sc number of atoms per cell
+ */
+//TODO improve/simplify/document use of na_all and na_round
+__global__ void nbnxn_gpu_x_to_nbat_x_kernel(int ncxy,
+ float *xnb,
+ int g,
+ bool FillLocal,
+ const rvec *x,
+ const int *a,
+ const int *na_all,
+ const int *cxy_ind,
+ int cell0,
+ int na_sc);
+
+__global__ void nbnxn_gpu_x_to_nbat_x_kernel(int ncxy,
+ float *xnb,
+ int g,
+ bool FillLocal,
+ const rvec *x,
+ const int *a,
+ const int *na_all,
+ const int *cxy_ind,
+ int cell0,
+ int na_sc)
+{
+ // Filler (padding) entries are placed far away so that they cannot contribute to interactions
+ const float farAway = -1000000;
+
+ /* map column-level parallelism to the y component of the CUDA block index */
+ int cxy = blockIdx.y;
+
+ if (cxy < ncxy)
+ {
+
+ int na = na_all[cxy];
+ int a0 = (cell0 + cxy_ind[cxy])*na_sc;
+ int na_round;
+ if (g == 0 && FillLocal)
+ {
+ na_round = (cxy_ind[cxy+1] - cxy_ind[cxy])*na_sc;
+ }
+ else
+ {
+ /* We fill only the real particle locations.
+ * We assume the filling entries at the end have been
+ * properly set before during pair-list generation.
+ */
+ na_round = na;
+ }
+
+ /* map parallelism within a column to the x component of the CUDA block index,
+ * linearized with the threads within a block */
+ const int i = blockIdx.x*blockDim.x + threadIdx.x;
+
+ /* float offset of this column's first atom in the xyzq output buffer (STRIDE_XYZQ floats per atom) */
+ const int j0 = a0*STRIDE_XYZQ;
+
+ /* perform conversion of each element */
+ if (i < na_round)
+ {
+ if (i < na)
+ {
+
+ xnb[j0+4*i] = x[a[a0+i]][XX];
+ xnb[j0+4*i+1] = x[a[a0+i]][YY];
+ xnb[j0+4*i+2] = x[a[a0+i]][ZZ];
+ }
+ else
+ {
+ xnb[j0+4*i] = farAway;
+ xnb[j0+4*i+1] = farAway;
+ xnb[j0+4*i+2] = farAway;
+ }
+ }
+ }
+}
#include "gromacs/nbnxm/gpu_common.h"
#include "gromacs/nbnxm/gpu_common_utils.h"
#include "gromacs/nbnxm/gpu_data_mgmt.h"
+#include "gromacs/nbnxm/gridset.h"
#include "gromacs/nbnxm/nbnxm.h"
#include "gromacs/nbnxm/pairlist.h"
+#include "gromacs/nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/gmxassert.h"
return shmem;
}
+/*! \brief Sync the nonlocal stream with dependent tasks in the local queue.
+ *
+ * As the point where the local stream tasks can be considered complete happens
+ * at the same call point where the nonlocal stream should be synced with the
+ * local, this function records the event if called with the local stream as
+ * argument and inserts into the nonlocal stream a wait on that event.
+ */
+static void insertNonlocalGpuDependency(const gmx_nbnxn_cuda_t *nb,
+ const InteractionLocality interactionLocality)
+{
+ cudaStream_t stream = nb->stream[interactionLocality];
+
+ /* When we get here all misc operations issued in the local stream as well as
+ the local xq H2D are done,
+ so we record that in the local stream and wait for it in the nonlocal one.
+ This wait needs to precede any PP tasks, bonded or nonbonded, that may
+ compute on interactions between local and nonlocal atoms.
+ */
+ if (nb->bUseTwoStreams)
+ {
+ if (interactionLocality == InteractionLocality::Local)
+ {
+ cudaError_t stat = cudaEventRecord(nb->misc_ops_and_local_H2D_done, stream);
+ CU_RET_ERR(stat, "cudaEventRecord on misc_ops_and_local_H2D_done failed");
+ }
+ else
+ {
+ cudaError_t stat = cudaStreamWaitEvent(stream, nb->misc_ops_and_local_H2D_done, 0);
+ CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_and_local_H2D_done failed");
+ }
+ }
+}
+
/*! \brief Launch asynchronously the xq buffer host to device copy. */
void gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t *nb,
const nbnxn_atomdata_t *nbatom,
adat_len = adat->natoms - adat->natoms_local;
}
+ /* HtoD x, q */
/* beginning of timed HtoD section */
if (bDoTime)
{
t->xf[atomLocality].nb_h2d.openTimingRegion(stream);
}
- /* HtoD x, q */
- cu_copy_H2D_async(adat->xq + adat_begin,
- static_cast<const void *>(nbatom->x().data() + adat_begin * 4),
+ cu_copy_H2D_async(adat->xq + adat_begin, static_cast<const void *>(nbatom->x().data() + adat_begin * 4),
adat_len * sizeof(*adat->xq), stream);
if (bDoTime)
This wait needs to precede any PP tasks, bonded or nonbonded, that may
compute on interactions between local and nonlocal atoms.
*/
- if (nb->bUseTwoStreams)
- {
- if (iloc == InteractionLocality::Local)
- {
- cudaError_t stat = cudaEventRecord(nb->misc_ops_and_local_H2D_done, stream);
- CU_RET_ERR(stat, "cudaEventRecord on misc_ops_and_local_H2D_done failed");
- }
- else
- {
- cudaError_t stat = cudaStreamWaitEvent(stream, nb->misc_ops_and_local_H2D_done, 0);
- CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_and_local_H2D_done failed");
- }
- }
+ insertNonlocalGpuDependency(nb, iloc);
}
/*! As we execute nonbonded workload in separate streams, before launching
}
}
+/* X buffer operations on GPU: performs conversion from rvec to nb format. */
+//TODO improve variable naming for g
+void nbnxn_gpu_x_to_nbat_x(const Nbnxm::GridSet &gridSet,
+ int g,
+ bool FillLocal,
+ gmx_nbnxn_gpu_t *nb,
+ void *xPmeDevicePtr,
+ int maxAtomsInColumn,
+ const Nbnxm::AtomLocality locality,
+ const rvec *x)
+{
+ cu_atomdata_t *adat = nb->atdat;
+ bool bDoTime = nb->bDoTime;
+
+ const Nbnxm::Grid &grid = gridSet.grids()[g];
+
+ //TODO improve naming here. Either use the getters directly or
+ //use variables with roughly the same names as the getters (perhaps
+ //numColumns, cellOffset, numAtomsPerCell)
+ const int ncxy = grid.numColumns();
+ const int cell0 = grid.cellOffset();
+ const int na_sc = grid.numAtomsPerCell();
+ Nbnxm::InteractionLocality interactionLoc = Nbnxm::InteractionLocality::Local;
+ int nCopyAtoms = gridSet.numRealAtomsLocal();
+ int copyAtomStart = 0;
+
+ if (locality == Nbnxm::AtomLocality::NonLocal)
+ {
+ interactionLoc = Nbnxm::InteractionLocality::NonLocal;
+ nCopyAtoms = gridSet.numRealAtomsTotal()-gridSet.numRealAtomsLocal();
+ copyAtomStart = gridSet.numRealAtomsLocal();
+ }
+
+ cudaStream_t stream = nb->stream[interactionLoc];
+
+ // FIXME: need to either let the local stream get to the
+ // insertNonlocalGpuDependency call or call it separately here
+ if (nCopyAtoms == 0) // empty domain
+ {
+ if (interactionLoc == Nbnxm::InteractionLocality::Local)
+ {
+ insertNonlocalGpuDependency(nb, interactionLoc);
+ }
+ return;
+ }
+
+ const rvec *d_x;
+
+ // a copy of the coordinates is required if a null pointer has been
+ // passed to the function, i.e. if they are not already present on the device
+ // TODO improve this mechanism
+ bool copyCoord = (xPmeDevicePtr == nullptr);
+
+ // copy X-coordinate data to device
+ if (copyCoord)
+ {
+ if (bDoTime)
+ {
+ nb->timers->xf[locality].nb_h2d.openTimingRegion(stream);
+ }
+
+ // FIXME: use copyToDeviceBuffer wrapper
+ // There still exist issues with host buffer not being pinned
+ // and another problem with wrong size being picked up by API
+ // auto devicePtr = &nb->xrvec[copyAtomStart][0];
+ // copyToDeviceBuffer(&devicePtr, &x[copyAtomStart][0], 0, nCopyAtoms,
+ // stream, GpuApiCallBehavior::Async, nullptr);
+ cudaError_t stat = cudaMemcpyAsync(&nb->xrvec[copyAtomStart][0], &x[copyAtomStart][0],
+ nCopyAtoms*sizeof(rvec), cudaMemcpyHostToDevice, stream);
+ CU_RET_ERR(stat, "cudaMemcpy failed on nb->xrvec");
+
+
+ if (bDoTime)
+ {
+ nb->timers->xf[locality].nb_h2d.closeTimingRegion(stream);
+ }
+
+ d_x = nb->xrvec;
+ }
+ else //coordinates have already been copied by PME stream
+ {
+ d_x = static_cast<rvec *>(xPmeDevicePtr);
+ }
+
+ /* launch kernel on GPU */
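+ // Mapping (see nbnxn_gpu_x_to_nbat_x_kernel): blockIdx.y indexes the grid columns and
+ // blockIdx.x*blockDim.x + threadIdx.x indexes the atom slots within a column, so
+ // gridSize[0] below covers up to maxAtomsInColumn+1 slots and gridSize[1] equals ncxy.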
+ const int threadsPerBlock = 128;
+
+ KernelLaunchConfig config;
+ config.blockSize[0] = threadsPerBlock;
+ config.blockSize[1] = 1;
+ config.blockSize[2] = 1;
+ config.gridSize[0] = ((maxAtomsInColumn+1)+threadsPerBlock-1)/threadsPerBlock;
+ config.gridSize[1] = ncxy;
+ config.gridSize[2] = 1;
+ config.sharedMemorySize = 0;
+ config.stream = stream;
+
+ auto kernelFn = nbnxn_gpu_x_to_nbat_x_kernel;
+ float *xqPtr = &(adat->xq->x);
+ const int *abufops = nb->abufops;
+ const int *nabufopsPtr = nb->nabufops[locality];
+ const int *cxybufopsPtr = nb->cxybufops[locality];
+ const auto kernelArgs = prepareGpuKernelArguments(kernelFn, config,
+ &ncxy,
+ &xqPtr,
+ &g,
+ &FillLocal,
+ &d_x,
+ &abufops,
+ &nabufopsPtr,
+ &cxybufopsPtr,
+ &cell0,
+ &na_sc);
+ launchGpuKernel(kernelFn, config, nullptr, "XbufferOps", kernelArgs);
+
+ insertNonlocalGpuDependency(nb, interactionLoc);
+}
+
} // namespace Nbnxm
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/nbnxm/atomdata.h"
#include "gromacs/nbnxm/gpu_data_mgmt.h"
+#include "gromacs/nbnxm/gridset.h"
#include "gromacs/nbnxm/nbnxm.h"
#include "gromacs/nbnxm/pairlistsets.h"
#include "gromacs/pbcutil/ishift.h"
return reinterpret_cast<rvec *>(nb->atdat->fshift);
}
+/* Initialization for X buffer operations on GPU. */
+/* TODO Remove explicit pinning of host arrays from here and manage it in a more natural way */
+void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet &gridSet,
+ gmx_nbnxn_gpu_t *gpu_nbv,
+ const Nbnxm::AtomLocality locality)
+{
+ cudaError_t stat;
+ const Nbnxm::InteractionLocality iloc = ((locality == AtomLocality::Local) ?
+ InteractionLocality::Local : InteractionLocality::NonLocal);
+ cudaStream_t stream = gpu_nbv->stream[iloc];
+ bool bDoTime = gpu_nbv->bDoTime;
+ int gridBegin = 0, gridEnd = 0;
+
+
+ switch (locality)
+ {
+ case Nbnxm::AtomLocality::All:
+ gridBegin = 0;
+ gridEnd = gridSet.grids().size();
+ break;
+ case Nbnxm::AtomLocality::Local:
+ gridBegin = 0;
+ gridEnd = 1;
+ break;
+ case Nbnxm::AtomLocality::NonLocal:
+ gridBegin = 1;
+ gridEnd = gridSet.grids().size();
+ break;
+ case Nbnxm::AtomLocality::Count:
+ GMX_ASSERT(false, "Count is invalid locality specifier");
+ break;
+ }
+
+ for (int g = gridBegin; g < gridEnd; g++)
+ {
+
+ const Nbnxm::Grid &grid = gridSet.grids()[g];
+
+ //TODO improve naming here. Either use the getters directly or
+ //use variables with roughly the same names as the getters
+ const int ncxy = grid.numColumns();
+ const int *a = gridSet.atomIndices().data();
+ const int a_nalloc = gridSet.atomIndices().size();
+ const int *na_all = grid.cxy_na().data();
+ const int *cxy_ind = grid.cxy_ind().data();
+ const int natoms_nonlocal = gridSet.numRealAtomsTotal();
+
+ if (iloc == Nbnxm::InteractionLocality::Local)
+ {
+
+ if (gpu_nbv->xrvec)
+ {
+ freeDeviceBuffer(&gpu_nbv->xrvec);
+ }
+ //TODO replace with reallocateDeviceBuffer
+ allocateDeviceBuffer(&gpu_nbv->xrvec, natoms_nonlocal, nullptr);
+
+ if (gpu_nbv->abufops)
+ {
+ freeDeviceBuffer(&gpu_nbv->abufops);
+ }
+ //TODO replace with reallocateDeviceBuffer
+ allocateDeviceBuffer(&gpu_nbv->abufops, a_nalloc, nullptr);
+
+ if (a_nalloc > 0)
+ {
+ // source data must be pinned for H2D assertion. This should be moved into place where data is (re-)alloced.
+ stat = cudaHostRegister((void*) a, a_nalloc*sizeof(int), cudaHostRegisterDefault);
+ CU_RET_ERR(stat, "cudaHostRegister failed on a");
+
+ if (bDoTime)
+ {
+ gpu_nbv->timers->xf[locality].nb_h2d.openTimingRegion(stream);
+ }
+
+ copyToDeviceBuffer(&gpu_nbv->abufops, a, 0, a_nalloc, stream, GpuApiCallBehavior::Async, nullptr);
+
+ if (bDoTime)
+ {
+ gpu_nbv->timers->xf[locality].nb_h2d.closeTimingRegion(stream);
+ }
+
+ stat = cudaHostUnregister((void*) a);
+ CU_RET_ERR(stat, "cudaHostUnRegister failed on a");
+ }
+ }
+
+
+ if (gpu_nbv->nabufops[locality])
+ {
+ freeDeviceBuffer(&gpu_nbv->nabufops[locality]);
+ }
+ //TODO replace with reallocateDeviceBuffer
+ allocateDeviceBuffer(&gpu_nbv->nabufops[locality], ncxy, nullptr);
+
+ if (gpu_nbv->cxybufops[locality])
+ {
+ freeDeviceBuffer(&gpu_nbv->cxybufops[locality]);
+ }
+ //TODO replace with reallocateDeviceBuffer
+ allocateDeviceBuffer(&gpu_nbv->cxybufops[locality], ncxy, nullptr);
+
+ if (ncxy > 0)
+ {
+ // source data must be pinned for H2D assertion. This should be moved into place where data is (re-)alloced.
+ stat = cudaHostRegister((void*) na_all, ncxy*sizeof(int), cudaHostRegisterDefault);
+ CU_RET_ERR(stat, "cudaHostRegister failed on na_all");
+
+ if (bDoTime)
+ {
+ gpu_nbv->timers->xf[locality].nb_h2d.openTimingRegion(stream);
+ }
+
+ copyToDeviceBuffer(&gpu_nbv->nabufops[locality], na_all, 0, ncxy, stream, GpuApiCallBehavior::Async, nullptr);
+
+ if (bDoTime)
+ {
+ gpu_nbv->timers->xf[locality].nb_h2d.closeTimingRegion(stream);
+ }
+
+ stat = cudaHostUnregister((void*) na_all);
+ CU_RET_ERR(stat, "cudaHostUnRegister failed on na_all");
+
+ // source data must be pinned for H2D assertion. This should be moved into place where data is (re-)alloced.
+ stat = cudaHostRegister((void*) cxy_ind, ncxy*sizeof(int), cudaHostRegisterDefault);
+ CU_RET_ERR(stat, "cudaHostRegister failed on cxy_ind");
+
+ if (bDoTime)
+ {
+ gpu_nbv->timers->xf[locality].nb_h2d.openTimingRegion(stream);
+ }
+
+ copyToDeviceBuffer(&gpu_nbv->cxybufops[locality], cxy_ind, 0, ncxy, stream, GpuApiCallBehavior::Async, nullptr);
+
+ if (bDoTime)
+ {
+ gpu_nbv->timers->xf[locality].nb_h2d.closeTimingRegion(stream);
+ }
+
+ stat = cudaHostUnregister((void*) cxy_ind);
+ CU_RET_ERR(stat, "cudaHostUnRegister failed on cxy_ind");
+ }
+ }
+}
+
} // namespace Nbnxm
const gmx_device_info_t *dev_info; /**< CUDA device information */
bool bUseTwoStreams; /**< true if doing both local/non-local NB work on GPU */
cu_atomdata_t *atdat; /**< atom data */
+ rvec *xrvec; /**< device copy of the coordinates in rvec format */
+ int *abufops; /**< x buf ops: atom index mapping */
+ gmx::EnumerationArray<Nbnxm::AtomLocality, int *> nabufops; /**< x buf ops: number of atoms per grid column (local and non-local) */
+ gmx::EnumerationArray<Nbnxm::AtomLocality, int *> cxybufops; /**< x buf ops: cell index for each grid column (local and non-local) */
cu_nbparam_t *nbparam; /**< parameters required for the non-bonded calc. */
gmx::EnumerationArray<Nbnxm::InteractionLocality, cu_plist_t *> plist; /**< pair-list data structures (local and non-local) */
nb_staging_t nbst; /**< staging area where fshift/energies get downloaded */
#endif
#if GMX_GPU == GMX_GPU_NONE
-typedef int gmx_nbnxn_gpu_t;
+using gmx_nbnxn_gpu_t = int;
#endif
#endif // !DOXYGEN
return cxy_na_[columnIndex];
}
+ /*! \brief Returns a view of the number of non-filler atoms for each grid column
+ *
+ * \todo Needs a useful name. */
+ gmx::ArrayRef<const int> cxy_na() const
+ {
+ return cxy_na_;
+ }
+ /*! \brief Returns a view of the grid-local cell index for each grid column
+ *
+ * \todo Needs a useful name. */
+ gmx::ArrayRef<const int> cxy_ind() const
+ {
+ return cxy_ind_;
+ }
+
+ //! Returns the number of atoms per cell
+ int numAtomsPerCell() const
+ {
+ return geometry_.numAtomsPerCell;
+ }
+
//! Returns the number of atoms in the column including padding
int paddedNumAtomsInColumn(int columnIndex) const
{
int cellOffset_;
/* Grid data */
- //! The number of, non-filler, atoms for each grid column
+ /*! \brief The number of non-filler atoms for each grid column.
+ *
+ * \todo Needs a useful name. */
std::vector<int> cxy_na_;
- //! The grid-local cell index for each grid column
+ /*! \brief The grid-local cell index for each grid column
+ *
+ * \todo Needs a useful name. */
std::vector<int> cxy_ind_;
//! The number of cluster for each cell
void nonbonded_verlet_t::setCoordinates(const Nbnxm::AtomLocality locality,
const bool fillLocal,
gmx::ArrayRef<const gmx::RVec> x,
+ bool useGpu,
+ void *xPmeDevicePtr,
gmx_wallcycle *wcycle)
{
wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
-
nbnxn_atomdata_copy_x_to_nbat_x(pairSearch_->gridSet(), locality, fillLocal,
as_rvec_array(x.data()),
- nbat.get());
-
+ nbat.get(), useGpu, gpu_nbv, xPmeDevicePtr);
wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
}
pairlistSets_->changePairlistRadii(rlistOuter, rlistInner);
}
+void
+nonbonded_verlet_t::atomdata_init_copy_x_to_nbat_x_gpu(const Nbnxm::AtomLocality locality)
+{
+ nbnxn_gpu_init_x_to_nbat_x(pairSearch_->gridSet(),
+ gpu_nbv,
+ locality);
+}
/*! \endcond */
void setCoordinates(Nbnxm::AtomLocality locality,
bool fillLocal,
gmx::ArrayRef<const gmx::RVec> x,
+ bool useGpu,
+ void *xPmeDevicePtr,
gmx_wallcycle *wcycle);
+ //! Initialize the GPU coordinate conversion (x buffer ops) for the given locality
+ void atomdata_init_copy_x_to_nbat_x_gpu(Nbnxm::AtomLocality locality);
+
+
//! Returns a reference to the pairlist sets
const PairlistSets &pairlistSets() const
{
#include "gromacs/gpu_utils/gpu_macros.h"
#include "gromacs/math/vectypes.h"
+#include "gromacs/nbnxm/atomdata.h"
#include "gromacs/utility/basedefinitions.h"
#include "gromacs/utility/real.h"
namespace Nbnxm
{
+class GridSet;
+
/*! \brief
* Launch asynchronously the xq buffer host to device copy.
*
GPU_FUNC_QUALIFIER
int gpu_pick_ewald_kernel_type(bool gmx_unused bTwinCut) GPU_FUNC_TERM_WITH_RETURN(-1)
+/*! \brief Initialization for X buffer operations on GPU.
+ * Called on the NS step and performs (re-)allocations and memory copies. */
+CUDA_FUNC_QUALIFIER
+void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet gmx_unused &gridSet,
+ gmx_nbnxn_gpu_t gmx_unused *gpu_nbv,
+ Nbnxm::AtomLocality gmx_unused locality) CUDA_FUNC_TERM
+
+/*! \brief X buffer operations on GPU: performs conversion from rvec to nb format.
+ */
+CUDA_FUNC_QUALIFIER
+void nbnxn_gpu_x_to_nbat_x(const Nbnxm::GridSet gmx_unused &gridSet,
+ int gmx_unused g,
+ bool gmx_unused FillLocal,
+ gmx_nbnxn_gpu_t gmx_unused *gpu_nbv,
+ void gmx_unused *xPmeDevicePtr,
+ int gmx_unused maxAtomsInColumn,
+ Nbnxm::AtomLocality gmx_unused locality,
+ const rvec gmx_unused *x) CUDA_FUNC_TERM
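+
+// Typical call sequence (illustrative, based on the do_force() changes in this patch;
+// not additional API): on search steps nbnxn_gpu_init_x_to_nbat_x() is reached through
+// nonbonded_verlet_t::atomdata_init_copy_x_to_nbat_x_gpu() and (re-)allocates and uploads
+// the index buffers; on other steps nbnxn_gpu_x_to_nbat_x() is reached through
+// nonbonded_verlet_t::setCoordinates(..., useGpu, xPmeDevicePtr, ...), where xPmeDevicePtr
+// comes from pme_gpu_get_device_x() when PME is offloaded to the same GPU.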
+
} // namespace Nbnxm
#endif