#include "gromacs/nbnxm/gpu_common.h"
#include "gromacs/nbnxm/gpu_common_utils.h"
#include "gromacs/nbnxm/gpu_data_mgmt.h"
+#include "gromacs/nbnxm/gridset.h"
#include "gromacs/nbnxm/nbnxm.h"
#include "gromacs/nbnxm/pairlist.h"
+#include "gromacs/nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/gmxassert.h"
return shmem;
}
+/*! \brief Sync the nonlocal stream with dependent tasks in the local queue.
+ *
+ * As the point where the local stream tasks can be considered complete happens
+ * at the same call point where the nonlocal stream should be synced with the
+ * the local, this function recrds the event if called with the local stream as
+ * argument and inserts in the GPU stream a wait on the event on the nonlocal.
+ */
+static void insertNonlocalGpuDependency(const gmx_nbnxn_cuda_t *nb,
+ const InteractionLocality interactionLocality)
+{
+ cudaStream_t stream = nb->stream[interactionLocality];
+
+ /* When we get here all misc operations issued in the local stream as well as
+ the local xq H2D are done,
+ so we record that in the local stream and wait for it in the nonlocal one.
+ This wait needs to precede any PP tasks, bonded or nonbonded, that may
+ compute on interactions between local and nonlocal atoms.
+ */
+ if (nb->bUseTwoStreams)
+ {
+ if (interactionLocality == InteractionLocality::Local)
+ {
+ cudaError_t stat = cudaEventRecord(nb->misc_ops_and_local_H2D_done, stream);
+ CU_RET_ERR(stat, "cudaEventRecord on misc_ops_and_local_H2D_done failed");
+ }
+ else
+ {
+ cudaError_t stat = cudaStreamWaitEvent(stream, nb->misc_ops_and_local_H2D_done, 0);
+ CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_and_local_H2D_done failed");
+ }
+ }
+}
+
/*! \brief Launch asynchronously the xq buffer host to device copy. */
void gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t *nb,
const nbnxn_atomdata_t *nbatom,
adat_len = adat->natoms - adat->natoms_local;
}
+ /* HtoD x, q */
/* beginning of timed HtoD section */
if (bDoTime)
{
t->xf[atomLocality].nb_h2d.openTimingRegion(stream);
}
- /* HtoD x, q */
- cu_copy_H2D_async(adat->xq + adat_begin,
- static_cast<const void *>(nbatom->x().data() + adat_begin * 4),
+ cu_copy_H2D_async(adat->xq + adat_begin, static_cast<const void *>(nbatom->x().data() + adat_begin * 4),
adat_len * sizeof(*adat->xq), stream);
if (bDoTime)
This wait needs to precede any PP tasks, bonded or nonbonded, that may
compute on interactions between local and nonlocal atoms.
*/
- if (nb->bUseTwoStreams)
- {
- if (iloc == InteractionLocality::Local)
- {
- cudaError_t stat = cudaEventRecord(nb->misc_ops_and_local_H2D_done, stream);
- CU_RET_ERR(stat, "cudaEventRecord on misc_ops_and_local_H2D_done failed");
- }
- else
- {
- cudaError_t stat = cudaStreamWaitEvent(stream, nb->misc_ops_and_local_H2D_done, 0);
- CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_and_local_H2D_done failed");
- }
- }
+ insertNonlocalGpuDependency(nb, iloc);
}
/*! As we execute nonbonded workload in separate streams, before launching
}
}
+/* X buffer operations on GPU: performs conversion from rvec to nb format. */
+//TODO improve variable naming for g
+void nbnxn_gpu_x_to_nbat_x(const Nbnxm::GridSet &gridSet,
+ int g,
+ bool FillLocal,
+ gmx_nbnxn_gpu_t *nb,
+ void *xPmeDevicePtr,
+ int maxAtomsInColumn,
+ const Nbnxm::AtomLocality locality,
+ const rvec *x)
+{
+ cu_atomdata_t *adat = nb->atdat;
+ bool bDoTime = nb->bDoTime;
+
+ const Nbnxm::Grid &grid = gridSet.grids()[g];
+
+ //TODO improve naming here. Either use the getters straight or
+ //using variables with about the same name as the getters (perhaps
+ //nColumns, cellOffset, nNatomsPerCell)
+ const int ncxy = grid.numColumns();
+ const int cell0 = grid.cellOffset();
+ const int na_sc = grid.numAtomsPerCell();
+ Nbnxm::InteractionLocality interactionLoc = Nbnxm::InteractionLocality::Local;
+ int nCopyAtoms = gridSet.numRealAtomsLocal();
+ int copyAtomStart = 0;
+
+ if (locality == Nbnxm::AtomLocality::NonLocal)
+ {
+ interactionLoc = Nbnxm::InteractionLocality::NonLocal;
+ nCopyAtoms = gridSet.numRealAtomsTotal()-gridSet.numRealAtomsLocal();
+ copyAtomStart = gridSet.numRealAtomsLocal();
+ }
+
+ cudaStream_t stream = nb->stream[interactionLoc];
+
+ // FIXME: need to either let the local stream get to the
+ // insertNonlocalGpuDependency call or call it separately here
+ if (nCopyAtoms == 0) // empty domain
+ {
+ if (interactionLoc == Nbnxm::InteractionLocality::Local)
+ {
+ insertNonlocalGpuDependency(nb, interactionLoc);
+ }
+ return;
+ }
+
+ const rvec *d_x;
+
+ // copy of coordinates will be required if null pointer has been
+ // passed to function
+ // TODO improve this mechanism
+ bool copyCoord = (xPmeDevicePtr == nullptr);
+
+ // copy X-coordinate data to device
+ if (copyCoord)
+ {
+ if (bDoTime)
+ {
+ nb->timers->xf[locality].nb_h2d.openTimingRegion(stream);
+ }
+
+ // FIXME: use copyToDeviceBuffer wrapper
+ // There still exist issues with host buffer not being pinned
+ // and another problem with wrong size being picked up by API
+ // auto devicePtr = &nb->xrvec[copyAtomStart][0];
+ // copyToDeviceBuffer(&devicePtr, &x[copyAtomStart][0], 0, nCopyAtoms,
+ // stream, GpuApiCallBehavior::Async, nullptr);
+ cudaError_t stat = cudaMemcpyAsync(&nb->xrvec[copyAtomStart][0], &x[copyAtomStart][0],
+ nCopyAtoms*sizeof(rvec), cudaMemcpyHostToDevice, stream);
+ CU_RET_ERR(stat, "cudaMemcpy failed on nb->xrvec");
+
+
+ if (bDoTime)
+ {
+ nb->timers->xf[locality].nb_h2d.closeTimingRegion(stream);
+ }
+
+ d_x = nb->xrvec;
+ }
+ else //coordinates have already been copied by PME stream
+ {
+ d_x = (rvec*) xPmeDevicePtr;
+ }
+
+ /* launch kernel on GPU */
+ const int threadsPerBlock = 128;
+
+ KernelLaunchConfig config;
+ config.blockSize[0] = threadsPerBlock;
+ config.blockSize[1] = 1;
+ config.blockSize[2] = 1;
+ config.gridSize[0] = ((maxAtomsInColumn+1)+threadsPerBlock-1)/threadsPerBlock;
+ config.gridSize[1] = ncxy;
+ config.gridSize[2] = 1;
+ config.sharedMemorySize = 0;
+ config.stream = stream;
+
+ auto kernelFn = nbnxn_gpu_x_to_nbat_x_kernel;
+ float *xqPtr = &(adat->xq->x);
+ const int *abufops = nb->abufops;
+ const int *nabufopsPtr = nb->nabufops[locality];
+ const int *cxybufopsPtr = nb->cxybufops[locality];
+ const auto kernelArgs = prepareGpuKernelArguments(kernelFn, config,
+ &ncxy,
+ &xqPtr,
+ &g,
+ &FillLocal,
+ &d_x,
+ &abufops,
+ &nabufopsPtr,
+ &cxybufopsPtr,
+ &cell0,
+ &na_sc);
+ launchGpuKernel(kernelFn, config, nullptr, "XbufferOps", kernelArgs);
+
+ insertNonlocalGpuDependency(nb, interactionLoc);
+}
+
} // namespace Nbnxm