Fixes for F buffer ops change
authorSzilárd Páll <pall.szilard@gmail.com>
Wed, 10 Jul 2019 18:48:16 +0000 (20:48 +0200)
committerMark Abraham <mark.j.abraham@gmail.com>
Wed, 31 Jul 2019 09:19:44 +0000 (11:19 +0200)
Change-Id: I8880ff91156c983560ee2173748faa8a4b189817

src/gromacs/mdlib/sim_util.cpp
src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
src/gromacs/nbnxm/nbnxm.cpp
src/gromacs/nbnxm/nbnxm.h
src/gromacs/nbnxm/nbnxm_gpu.h

index a1f2f5a773d54bea43962b92e45348e83ea51d86..e525fad2974fb2fda89edbbc6b9bca158520d32c 100644 (file)
@@ -1454,7 +1454,7 @@ void do_force(FILE                                     *fplog,
         {
             if (useGpuFBufOps == BufferOpsUseGpu::True)
             {
-                nbv->wait_stream_gpu(Nbnxm::AtomLocality::NonLocal);
+                nbv->wait_for_gpu_force_reduction(Nbnxm::AtomLocality::NonLocal);
             }
             dd_move_f(cr->dd, force.unpaddedArrayRef(), fr->fshift, wcycle);
         }
@@ -1526,6 +1526,11 @@ void do_force(FILE                                     *fplog,
     if (bUseOrEmulGPU && !alternateGpuWait)
     {
 
+        // TODO: move these steps as early as possible:
+        // - CPU f H2D should be as soon as all CPU-side forces are done
+        // - wait for force reduction does not need to block host (at least not here, it's sufficient to wait
+        //   before the next CPU task that consumes the forces: vsite spread or update)
+        //
         if (useGpuFBufOps == BufferOpsUseGpu::True && haveCpuForces)
         {
             nbv->launch_copy_f_to_gpu(forceOut.f, Nbnxm::AtomLocality::Local);
@@ -1535,7 +1540,7 @@ void do_force(FILE                                     *fplog,
         if (useGpuFBufOps == BufferOpsUseGpu::True)
         {
             nbv->launch_copy_f_from_gpu(forceOut.f, Nbnxm::AtomLocality::Local);
-            nbv->wait_stream_gpu(Nbnxm::AtomLocality::Local);
+            nbv->wait_for_gpu_force_reduction(Nbnxm::AtomLocality::Local);
         }
     }
 
index e48713c7ce5697134f33c4c06c56ade402a68893..ec16b1b01ca41c1379e80cb86f973c8735c9ca9d 100644 (file)
@@ -758,6 +758,9 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid               &grid,
                            int                              gridId,
                            int                              numColumnsMax)
 {
+    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+    GMX_ASSERT(x,  "Need a valid x pointer");
+
     cu_atomdata_t             *adat    = nb->atdat;
     bool                       bDoTime = nb->bDoTime;
 
@@ -812,6 +815,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid               &grid,
     {
         d_x = (rvec*) xPmeDevicePtr;
     }
+    GMX_ASSERT(d_x,  "Need a valid d_x pointer");
 
     /* launch kernel on GPU */
 
@@ -853,10 +857,12 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality                  atomLocality,
                                int                                 nAtoms,
                                GpuBufferOpsAccumulateForce         accumulateForce)
 {
+    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 
-    cu_atomdata_t       *adat    = nb->atdat;
-    cudaStream_t         stream  = atomLocality == AtomLocality::Local ?
-        nb->stream[InteractionLocality::Local] : nb->stream[InteractionLocality::NonLocal];
+    const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality);
+    cudaStream_t              stream    = nb->stream[iLocality];
+
+    cu_atomdata_t            *adat    = nb->atdat;
 
     /* launch kernel */
 
@@ -892,12 +898,16 @@ void nbnxn_launch_copy_f_to_gpu(const AtomLocality               atomLocality,
                                 gmx_nbnxn_gpu_t                 *nb,
                                 rvec                            *f)
 {
-    cudaStream_t         stream  = atomLocality == AtomLocality::Local ?
-        nb->stream[InteractionLocality::Local] : nb->stream[InteractionLocality::NonLocal];
-    bool                 bDoTime = nb->bDoTime;
-    cu_timers_t         *t       = nb->timers;
+    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+    GMX_ASSERT(f,  "Need a valid f pointer");
 
-    int                  atomStart = 0, nAtoms = 0;
+    const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality);
+    cudaStream_t              stream    = nb->stream[iLocality];
+
+    bool                      bDoTime = nb->bDoTime;
+    cu_timers_t              *t       = nb->timers;
+
+    int                       atomStart = 0, nAtoms = 0;
 
     nbnxn_get_atom_range(atomLocality, gridSet, &atomStart, &nAtoms);
 
@@ -927,12 +937,15 @@ void nbnxn_launch_copy_f_from_gpu(const AtomLocality               atomLocality,
                                   gmx_nbnxn_gpu_t                 *nb,
                                   rvec                            *f)
 {
-    cudaStream_t         stream  = atomLocality == AtomLocality::Local ?
-        nb->stream[InteractionLocality::Local] : nb->stream[InteractionLocality::NonLocal];
-    bool                 bDoTime = nb->bDoTime;
-    cu_timers_t         *t       = nb->timers;
+    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+    GMX_ASSERT(f,  "Need a valid f pointer");
 
-    int                  atomStart = 0, nAtoms = 0;
+    const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality);
+    cudaStream_t              stream    = nb->stream[iLocality];
+
+    bool                      bDoTime = nb->bDoTime;
+    cu_timers_t              *t       = nb->timers;
+    int                       atomStart, nAtoms;
 
     nbnxn_get_atom_range(atomLocality, gridSet, &atomStart, &nAtoms);
 
@@ -941,6 +954,7 @@ void nbnxn_launch_copy_f_from_gpu(const AtomLocality               atomLocality,
         t->xf[atomLocality].nb_d2h.openTimingRegion(stream);
     }
 
+    GMX_ASSERT(nb->frvec,  "Need a valid nb->frvec pointer");
     rvec       *ptrDest = reinterpret_cast<rvec *> (f[atomStart]);
     rvec       *ptrSrc  = reinterpret_cast<rvec *> (nb->frvec[atomStart]);
     //copyFromDeviceBuffer(ptrDest, &ptrSrc, 0, nAtoms,
@@ -957,12 +971,14 @@ void nbnxn_launch_copy_f_from_gpu(const AtomLocality               atomLocality,
     return;
 }
 
-void nbnxn_wait_stream_gpu(const AtomLocality      gmx_unused atomLocality,
-                           gmx_nbnxn_gpu_t                   *nb)
+void nbnxn_wait_for_gpu_force_reduction(const AtomLocality      gmx_unused atomLocality,
+                                        gmx_nbnxn_gpu_t                   *nb)
 {
+    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+
+    const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality);
 
-    cudaStream_t         stream  = atomLocality == AtomLocality::Local ?
-        nb->stream[InteractionLocality::Local] : nb->stream[InteractionLocality::NonLocal];
+    cudaStream_t              stream    = nb->stream[iLocality];
 
     cudaStreamSynchronize(stream);
 
index f7208272b57bad7ac7efe03c790a34272c98d8b2..681dcf2d8bbc8c36ee72743ca25b25d5810ccbbc 100644 (file)
@@ -248,9 +248,9 @@ void nonbonded_verlet_t::launch_copy_f_from_gpu(rvec *f, const Nbnxm::AtomLocali
                                  f);
 }
 
-void nonbonded_verlet_t::wait_stream_gpu(const Nbnxm::AtomLocality locality)
+void nonbonded_verlet_t::wait_for_gpu_force_reduction(const Nbnxm::AtomLocality locality)
 {
-    nbnxn_wait_stream_gpu(locality, gpu_nbv);
+    nbnxn_wait_for_gpu_force_reduction(locality, gpu_nbv);
 }
 
 /*! \endcond */
index 1e177a57a0b4c0588c59f2f4e9d2c2937787a5ef..7a6491a4aac4538d34e0819d022b94a12c4fdbce 100644 (file)
@@ -328,8 +328,11 @@ struct nonbonded_verlet_t
         /*! \brief D2H transfer of force buffer*/
         void launch_copy_f_from_gpu(rvec *f, Nbnxm::AtomLocality locality);
 
-        /*! \brief Host sync on device stream given by locality */
-        void wait_stream_gpu(Nbnxm::AtomLocality locality);
+        /*! \brief Wait for GPU force reduction task and D2H transfer of its results to complete
+         *
+         * FIXME: need more details: when should be called / after which operation, etc.
+         */
+        void wait_for_gpu_force_reduction(Nbnxm::AtomLocality locality);
 
         //! Return the kernel setup
         const Nbnxm::KernelSetup &kernelSetup() const
index 3295fdb1f95add263bb728c3752f965eb78c3117..0749c5e112235e509a65f33cc17ab02c90597ac6 100644 (file)
@@ -300,8 +300,8 @@ void nbnxn_launch_copy_f_from_gpu(const AtomLocality      gmx_unused  atomLocali
 
 /*! \brief Wait for GPU stream to complete */
 CUDA_FUNC_QUALIFIER
-void nbnxn_wait_stream_gpu(const AtomLocality      gmx_unused  atomLocality,
-                           gmx_nbnxn_gpu_t         gmx_unused *nb) CUDA_FUNC_TERM
+void nbnxn_wait_for_gpu_force_reduction(const AtomLocality      gmx_unused  atomLocality,
+                                        gmx_nbnxn_gpu_t         gmx_unused *nb) CUDA_FUNC_TERM
 
 
 }     // namespace Nbnxm