Decouple GPU force buffer management from buffer ops in NBNXM
src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
index 63051b7c4314f77632157b6c011d0549d9cd03ae..6a17ee6a9b14386238cae8a4b4dfb4acead899ee 100644
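
This change threads the destination (total) force buffer through the buffer-ops API as an explicit DeviceBuffer<float> argument instead of letting the reduction reach into nb->frvec internally, and adds an accessor, nbnxn_gpu_get_f_gpu(), through which callers obtain that buffer. A caller-side sketch of the resulting flow (illustrative only; the surrounding variables and namespace qualifiers are assumed, not part of this diff):

    // Sketch: fetch the device-side total-force buffer once via the new
    // accessor, then pass it to the buffer-ops reduction explicitly.
    DeviceBuffer<float> totalForcesDevice = nbnxn_gpu_get_f_gpu(nb);
    nbnxn_gpu_add_nbat_f_to_f(AtomLocality::Local,
                              totalForcesDevice,
                              nb,
                              pmeForcesDevice,     // void* to the PME force buffer on the device
                              pmeForcesReady,      // GpuEventSynchronizer the stream waits on
                              atomStart,
                              numAtoms,
                              useGpuFPmeReduction,
                              accumulateForce);
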
@@ -859,22 +859,22 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid               &grid,
 
 /* F buffer operations on GPU: performs force summations and conversion from nb to rvec format. */
 void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality               atomLocality,
+                               DeviceBuffer<float>              totalForcesDevice,
                                gmx_nbnxn_gpu_t                 *nb,
-                               void                            *fPmeDevicePtr,
+                               void                            *pmeForcesDevice,
                                GpuEventSynchronizer            *pmeForcesReady,
                                int                              atomStart,
-                               int                              nAtoms,
+                               int                              numAtoms,
                                bool                             useGpuFPmeReduction,
                                bool                             accumulateForce)
 {
     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 
-    const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality);
-    cudaStream_t              stream    = nb->stream[iLocality];
-    cu_atomdata_t            *adat      = nb->atdat;
-    bool                      addPmeF   = useGpuFPmeReduction;
+    const InteractionLocality iLocality     = gpuAtomToInteractionLocality(atomLocality);
+    cudaStream_t              stream        = nb->stream[iLocality];
+    cu_atomdata_t            *adat          = nb->atdat;
 
-    if (addPmeF)
+    if (useGpuFPmeReduction)
     {
         //Stream must wait for PME force completion
         pmeForcesReady->enqueueWaitEvent(stream);
@@ -886,7 +886,7 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality               atomLocality,
     config.blockSize[0]     = c_bufOpsThreadsPerBlock;
     config.blockSize[1]     = 1;
     config.blockSize[2]     = 1;
-    config.gridSize[0]      = ((nAtoms+1)+c_bufOpsThreadsPerBlock-1)/c_bufOpsThreadsPerBlock;
+    config.gridSize[0]      = ((numAtoms+1)+c_bufOpsThreadsPerBlock-1)/c_bufOpsThreadsPerBlock;
     config.gridSize[1]      = 1;
     config.gridSize[2]      = 1;
     config.sharedMemorySize = 0;
@@ -896,30 +896,35 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality               atomLocality,
         nbnxn_gpu_add_nbat_f_to_f_kernel<true, false> :
         nbnxn_gpu_add_nbat_f_to_f_kernel<false, false>;
 
-    if (addPmeF)
+    if (useGpuFPmeReduction)
     {
         kernelFn = accumulateForce ?
             nbnxn_gpu_add_nbat_f_to_f_kernel<true, true> :
             nbnxn_gpu_add_nbat_f_to_f_kernel<false, true>;
     }
 
-    const float3     *d_f     = adat->f;
-    float3           *d_fNB   = (float3*) nb->frvec;
-    const float3     *d_fPme  = (float3*) fPmeDevicePtr;
-    const int        *d_cell  = nb->cell;
+    const float3     *d_fNB    = adat->f;
+    const float3     *d_fPme   = (float3*) pmeForcesDevice;
+    float3           *d_fTotal = (float3*) totalForcesDevice;
+    const int        *d_cell   = nb->cell;
 
     const auto        kernelArgs   = prepareGpuKernelArguments(kernelFn, config,
-                                                               &d_f,
-                                                               &d_fPme,
                                                                &d_fNB,
+                                                               &d_fPme,
+                                                               &d_fTotal,
                                                                &d_cell,
                                                                &atomStart,
-                                                               &nAtoms);
+                                                               &numAtoms);
 
     launchGpuKernel(kernelFn, config, nullptr, "FbufferOps", kernelArgs);
 
 }
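
For reference, the kernel choice above maps the two runtime flags onto the kernel's two boolean template parameters (accumulate into the output, and add PME forces); the same dispatch can be written as a single expression (a sketch, not part of this change):

    // Equivalent one-expression selection of the four compiled kernel variants.
    auto kernelFn = useGpuFPmeReduction ?
        (accumulateForce ? nbnxn_gpu_add_nbat_f_to_f_kernel<true, true> :
                           nbnxn_gpu_add_nbat_f_to_f_kernel<false, true>) :
        (accumulateForce ? nbnxn_gpu_add_nbat_f_to_f_kernel<true, false> :
                           nbnxn_gpu_add_nbat_f_to_f_kernel<false, false>);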
 
+/* Getter for the device buffer that holds the total forces in rvec format. */
+DeviceBuffer<float> nbnxn_gpu_get_f_gpu(gmx_nbnxn_gpu_t *nb)
+{
+    return reinterpret_cast< DeviceBuffer<float> >(nb->frvec);
+}
+
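In the CUDA build, DeviceBuffer<float> is an alias for a plain device pointer, so the accessor above only re-types nb->frvec; no data is copied. A simplified sketch of the alias this relies on (not the verbatim definition in the gpu_utils headers):

    // CUDA path: a DeviceBuffer is just a typed device pointer.
    template<typename ValueType>
    using DeviceBuffer = ValueType*;
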
 void nbnxn_launch_copy_f_to_gpu(const AtomLocality               atomLocality,
                                 const Nbnxm::GridSet            &gridSet,
                                 gmx_nbnxn_gpu_t                 *nb,