Position buffer ops in CUDA
[alexxy/gromacs.git] / src / gromacs / nbnxm / cuda / nbnxm_cuda.cu
index 250323b280a21abae6df836d96ef3aee17a294ef..19718a45d20f9771ca1ebbdae0acc816f049ade4 100644 (file)
 #include "gromacs/nbnxm/gpu_common.h"
 #include "gromacs/nbnxm/gpu_common_utils.h"
 #include "gromacs/nbnxm/gpu_data_mgmt.h"
+#include "gromacs/nbnxm/gridset.h"
 #include "gromacs/nbnxm/nbnxm.h"
 #include "gromacs/nbnxm/pairlist.h"
+#include "gromacs/nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh"
 #include "gromacs/timing/gpu_timing.h"
 #include "gromacs/utility/cstringutil.h"
 #include "gromacs/utility/gmxassert.h"
@@ -271,6 +273,39 @@ static inline int calc_shmem_required_nonbonded(const int num_threads_z, const g
     return shmem;
 }
 
+/*! \brief Sync the nonlocal stream with dependent tasks in the local queue.
+ *
+ *  As the point where the local stream tasks can be considered complete coincides
+ *  with the call point where the nonlocal stream should be synchronized with the
+ *  local one, this function records the event if called with the local stream as
+ *  argument, and inserts into the nonlocal GPU stream a wait on that event.
+ */
+static void insertNonlocalGpuDependency(const gmx_nbnxn_cuda_t   *nb,
+                                        const InteractionLocality interactionLocality)
+{
+    cudaStream_t stream  = nb->stream[interactionLocality];
+
+    /* When we get here all misc operations issued in the local stream as well as
+       the local xq H2D are done,
+       so we record that in the local stream and wait for it in the nonlocal one.
+       This wait needs to precede any PP tasks, bonded or nonbonded, that may
+       compute on interactions between local and nonlocal atoms.
+     */
+    if (nb->bUseTwoStreams)
+    {
+        if (interactionLocality == InteractionLocality::Local)
+        {
+            cudaError_t stat = cudaEventRecord(nb->misc_ops_and_local_H2D_done, stream);
+            CU_RET_ERR(stat, "cudaEventRecord on misc_ops_and_local_H2D_done failed");
+        }
+        else
+        {
+            cudaError_t stat = cudaStreamWaitEvent(stream, nb->misc_ops_and_local_H2D_done, 0);
+            CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_and_local_H2D_done failed");
+        }
+    }
+}
+
 /*! \brief Launch asynchronously the xq buffer host to device copy. */
 void gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t       *nb,
                         const nbnxn_atomdata_t *nbatom,
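
The insertNonlocalGpuDependency() helper added above boils down to the standard
CUDA event record/wait idiom for ordering work across streams. A minimal
standalone sketch of that idiom follows; the names (localStream, nonlocalStream,
localDoneEvent) are illustrative, not GROMACS API, and error checking is omitted:

    #include <cuda_runtime.h>

    static void syncNonlocalWithLocal(cudaStream_t localStream,
                                      cudaStream_t nonlocalStream,
                                      cudaEvent_t  localDoneEvent)
    {
        // Mark the completion point of all work enqueued in the local
        // stream so far.
        cudaEventRecord(localDoneEvent, localStream);
        // Work enqueued in the nonlocal stream after this call will not
        // start on the device until localDoneEvent has fired; the host
        // thread does not block.
        cudaStreamWaitEvent(nonlocalStream, localDoneEvent, 0);
    }
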
@@ -319,15 +354,14 @@ void gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t       *nb,
         adat_len    = adat->natoms - adat->natoms_local;
     }
 
+    /* HtoD x, q */
     /* beginning of timed HtoD section */
     if (bDoTime)
     {
         t->xf[atomLocality].nb_h2d.openTimingRegion(stream);
     }
 
-    /* HtoD x, q */
-    cu_copy_H2D_async(adat->xq + adat_begin,
-                      static_cast<const void *>(nbatom->x().data() + adat_begin * 4),
+    cu_copy_H2D_async(adat->xq + adat_begin, static_cast<const void *>(nbatom->x().data() + adat_begin * 4),
                       adat_len * sizeof(*adat->xq), stream);
 
     if (bDoTime)
@@ -341,19 +375,7 @@ void gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t       *nb,
        This wait needs to precede any PP tasks, bonded or nonbonded, that may
        compute on interactions between local and nonlocal atoms.
      */
-    if (nb->bUseTwoStreams)
-    {
-        if (iloc == InteractionLocality::Local)
-        {
-            cudaError_t stat = cudaEventRecord(nb->misc_ops_and_local_H2D_done, stream);
-            CU_RET_ERR(stat, "cudaEventRecord on misc_ops_and_local_H2D_done failed");
-        }
-        else
-        {
-            cudaError_t stat = cudaStreamWaitEvent(stream, nb->misc_ops_and_local_H2D_done, 0);
-            CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_and_local_H2D_done failed");
-        }
-    }
+    insertNonlocalGpuDependency(nb, iloc);
 }
 
 /*! As we execute nonbonded workload in separate streams, before launching
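
The H2D copy in the hunk above relies on the nbnxn xq layout of four floats per
atom (x, y, z, q): the host array behind nbatom->x() is a flat float buffer, so
its offset is scaled by 4, while the device pointer adat->xq advances in
float4-sized elements. A hedged sketch of the same arithmetic, with hypothetical
names:

    #include <cuda_runtime.h>

    // d_xq points to packed (x, y, z, q) float4 elements on the device;
    // h_x is the flat 4-floats-per-atom host array.
    static void copyXqRange(float4 *d_xq, const float *h_x,
                            int atomBegin, int atomCount, cudaStream_t stream)
    {
        cudaMemcpyAsync(d_xq + atomBegin,          // float4-sized stride
                        h_x + atomBegin * 4,       // 4 floats per atom
                        atomCount * sizeof(*d_xq), // atomCount * 4 floats
                        cudaMemcpyHostToDevice, stream);
    }
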
@@ -715,4 +737,122 @@ void cuda_set_cacheconfig()
     }
 }
 
+/* X buffer operations on GPU: performs the conversion from rvec to nb format. */
+//TODO improve variable naming for g
+void nbnxn_gpu_x_to_nbat_x(const Nbnxm::GridSet            &gridSet,
+                           int                              g,
+                           bool                             FillLocal,
+                           gmx_nbnxn_gpu_t                 *nb,
+                           void                            *xPmeDevicePtr,
+                           int                              maxAtomsInColumn,
+                           const Nbnxm::AtomLocality        locality,
+                           const rvec                      *x)
+{
+    cu_atomdata_t             *adat    = nb->atdat;
+    bool                       bDoTime = nb->bDoTime;
+
+    const Nbnxm::Grid         &grid       = gridSet.grids()[g];
+
+    //TODO improve naming here. Either use the getters directly or
+    //use variables named after the getters (perhaps
+    //nColumns, cellOffset, nAtomsPerCell)
+    const int                  ncxy            = grid.numColumns();
+    const int                  cell0           = grid.cellOffset();
+    const int                  na_sc           = grid.numAtomsPerCell();
+    Nbnxm::InteractionLocality interactionLoc  = Nbnxm::InteractionLocality::Local;
+    int nCopyAtoms                             = gridSet.numRealAtomsLocal();
+    int copyAtomStart                          = 0;
+
+    if (locality == Nbnxm::AtomLocality::NonLocal)
+    {
+        interactionLoc          = Nbnxm::InteractionLocality::NonLocal;
+        nCopyAtoms              = gridSet.numRealAtomsTotal()-gridSet.numRealAtomsLocal();
+        copyAtomStart           = gridSet.numRealAtomsLocal();
+    }
+
+    cudaStream_t   stream  = nb->stream[interactionLoc];
+
+    // FIXME: need to either let the local stream get to the
+    // insertNonlocalGpuDependency call or call it separately here
+    if (nCopyAtoms == 0) // empty domain
+    {
+        if (interactionLoc == Nbnxm::InteractionLocality::Local)
+        {
+            insertNonlocalGpuDependency(nb, interactionLoc);
+        }
+        return;
+    }
+
+    const rvec *d_x;
+
+    // A copy of the coordinates is required if a null pointer has been
+    // passed to the function.
+    // TODO improve this mechanism
+    bool        copyCoord = (xPmeDevicePtr == nullptr);
+
+    // copy X-coordinate data to device
+    if (copyCoord)
+    {
+        if (bDoTime)
+        {
+            nb->timers->xf[locality].nb_h2d.openTimingRegion(stream);
+        }
+
+        // FIXME: use copyToDeviceBuffer wrapper
+        // There still exist issues with host buffer not being pinned
+        // and another problem with wrong size being picked up by API
+        // auto devicePtr = &nb->xrvec[copyAtomStart][0];
+        // copyToDeviceBuffer(&devicePtr, &x[copyAtomStart][0], 0, nCopyAtoms,
+        //                    stream, GpuApiCallBehavior::Async, nullptr);
+        cudaError_t stat = cudaMemcpyAsync(&nb->xrvec[copyAtomStart][0], &x[copyAtomStart][0],
+                                           nCopyAtoms*sizeof(rvec), cudaMemcpyHostToDevice, stream);
+        CU_RET_ERR(stat, "cudaMemcpyAsync failed on nb->xrvec");
+
+        if (bDoTime)
+        {
+            nb->timers->xf[locality].nb_h2d.closeTimingRegion(stream);
+        }
+
+        d_x = nb->xrvec;
+    }
+    else // coordinates have already been copied by the PME stream
+    {
+        d_x = (rvec*) xPmeDevicePtr;
+    }
+
+    /* launch kernel on GPU */
+    const int          threadsPerBlock = 128;
+
+    KernelLaunchConfig config;
+    config.blockSize[0]     = threadsPerBlock;
+    config.blockSize[1]     = 1;
+    config.blockSize[2]     = 1;
+    config.gridSize[0]      = ((maxAtomsInColumn+1)+threadsPerBlock-1)/threadsPerBlock;
+    config.gridSize[1]      = ncxy;
+    config.gridSize[2]      = 1;
+    config.sharedMemorySize = 0;
+    config.stream           = stream;
+
+    auto       kernelFn     = nbnxn_gpu_x_to_nbat_x_kernel;
+    float     *xqPtr        = &(adat->xq->x);
+    const int *abufops      = nb->abufops;
+    const int *nabufopsPtr  = nb->nabufops[locality];
+    const int *cxybufopsPtr = nb->cxybufops[locality];
+    const auto kernelArgs   = prepareGpuKernelArguments(kernelFn, config,
+                                                        &ncxy,
+                                                        &xqPtr,
+                                                        &g,
+                                                        &FillLocal,
+                                                        &d_x,
+                                                        &abufops,
+                                                        &nabufopsPtr,
+                                                        &cxybufopsPtr,
+                                                        &cell0,
+                                                        &na_sc);
+    launchGpuKernel(kernelFn, config, nullptr, "XbufferOps", kernelArgs);
+
+    insertNonlocalGpuDependency(nb, interactionLoc);
+}
+
 } // namespace Nbnxm
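
The launch configuration in nbnxn_gpu_x_to_nbat_x() assigns one block row per
grid column (config.gridSize[1] = ncxy) and rounds maxAtomsInColumn+1 up to a
whole number of 128-thread blocks in the x dimension. The kernel itself lives in
nbnxm_buffer_ops_kernels.cuh and is not shown in this diff; the following is
only an illustrative sketch, with hypothetical names, of how a kernel typically
consumes such a configuration:

    // Hypothetical sketch, not the GROMACS kernel: blockIdx.y selects the
    // grid column, and the x-dimension threads cover the atoms within it.
    __global__ void xToXqSketch(const float3 *x, float4 *xq,
                                const int    *atomsPerColumn,
                                const int    *columnFirstAtom)
    {
        const int column = blockIdx.y;
        const int i      = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < atomsPerColumn[column])
        {
            const int a = columnFirstAtom[column] + i;
            // Repack xyz into the 4-float xq layout; the charge (w)
            // component is assumed to be filled elsewhere.
            xq[a].x = x[a].x;
            xq[a].y = x[a].y;
            xq[a].z = x[a].z;
        }
    }
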