Trigger synchronizer when local forces are ready
author Szilárd Páll <pall.szilard@gmail.com>
Mon, 14 Oct 2019 17:19:03 +0000 (19:19 +0200)
committer Szilárd Páll <pall.szilard@gmail.com>
Tue, 15 Oct 2019 23:54:41 +0000 (01:54 +0200)
The synchronizer is created and managed in StatePropagatorDataGpu and is
passed to the nonbonded module during f buffer ops initialization.

Refs #2888 #3126

Change-Id: Ie9bf0b6cd8511fe282e377e48f3940e591db214c

src/gromacs/mdlib/sim_util.cpp
src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
src/gromacs/nbnxm/nbnxm.cpp
src/gromacs/nbnxm/nbnxm.h
src/gromacs/nbnxm/nbnxm_gpu.h

index 2f2cade7db89105f95e6ebdfb1f2c6a00d43e450..5c40ff7f165c710376db8a40c833b6551d7b1280 100644 (file)
@@ -1155,7 +1155,8 @@ void do_force(FILE                                     *fplog,
         // NS step is also a virial step (on which f buf ops are deactivated).
         if (simulationWork.useGpuBufferOps && simulationWork.useGpuNonbonded && (GMX_GPU == GMX_GPU_CUDA))
         {
-            nbv->atomdata_init_add_nbat_f_to_f_gpu();
+            GMX_ASSERT(stateGpu, "stateGpu should be valid here");
+            nbv->atomdata_init_add_nbat_f_to_f_gpu(stateGpu->fReducedOnDevice());
         }
     }
     else if (!EI_TPI(inputrec->eI))
@@ -1544,6 +1545,7 @@ void do_force(FILE                                     *fplog,
                                                   pme_gpu_get_device_f(fr->pmedata),
                                                   dependencyList,
                                                   false, haveNonLocalForceContribInCpuBuffer);
+                // TODO: this should be conditional on whether GPU direct comm is used?
                 stateGpu->copyForcesFromGpu(forceOut.forceWithShiftForces().force(), gmx::StatePropagatorDataGpu::AtomLocality::NonLocal);
             }
             else
index 7b1241e5e2ca56f1bdfee93a950964d11692d3e0..201a97055fdce93cb7df1b0b8de171590299cb44 100644 (file)
@@ -879,6 +879,11 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality                          atomL
 
     launchGpuKernel(kernelFn, config, nullptr, "FbufferOps", kernelArgs);
 
+    if (atomLocality == AtomLocality::Local)
+    {
+        GMX_ASSERT(nb->localFReductionDone != nullptr, "localFReductionDone has to be a valid pointer");
+        nb->localFReductionDone->markEvent(stream);
+    }
 }
 
 void* nbnxn_get_x_on_device_event(const gmx_nbnxn_cuda_t   *nb)
index 71e25d23b110b905aa579168ccde66f0fe3f1056..67e7b581e7830c7fc1738e5bf4cafd45a3dd86d5 100644 (file)
@@ -963,13 +963,17 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet            &gridSet,
 }
 
 /* Initialization for F buffer operations on GPU. */
-void nbnxn_gpu_init_add_nbat_f_to_f(const int                *cell,
-                                    gmx_nbnxn_gpu_t          *gpu_nbv,
-                                    int                       natoms_total)
+void nbnxn_gpu_init_add_nbat_f_to_f(const int                  *cell,
+                                    gmx_nbnxn_gpu_t            *gpu_nbv,
+                                    int                         natoms_total,
+                                    GpuEventSynchronizer* const localReductionDone)
 {
 
     cudaStream_t         stream  = gpu_nbv->stream[InteractionLocality::Local];
 
+    GMX_ASSERT(localReductionDone, "localReductionDone should be a valid pointer");
+    gpu_nbv->localFReductionDone = localReductionDone;
+
     if (natoms_total > 0)
     {
         reallocateDeviceBuffer(&gpu_nbv->cell, natoms_total, &gpu_nbv->ncell, &gpu_nbv->ncell_alloc, nullptr);
index f3fc0e88527c6626c0a4021e9b03898f47da8d61..4667d63c4271559f7a2f2191289fa62b40ea4886 100644 (file)
@@ -274,6 +274,12 @@ struct gmx_nbnxn_cuda_t
                                                    any dependent task (e.g. transfer of coordinates
                                                    to the PME rank's GPU) can proceed. */
 
+    /*! \brief Pointer to event synchronizer triggered when the local GPU buffer ops / reduction is complete
+     *
+     * \note That the synchronizer is managed outside of this module in StatePropagatorDataGpu.
+     */
+    GpuEventSynchronizer *localFReductionDone;
+
     GpuEventSynchronizer *xNonLocalCopyD2HDone; /**< event triggered when
                                                    non-local coordinate buffer has been
                                                    copied from device to host*/
index 3a796b504e67ada819190c51ffcd665e649e591e..52d2fecb1305d7c63aabb81608c7416756aa664f 100644 (file)
@@ -219,7 +219,7 @@ nonbonded_verlet_t::atomdata_add_nbat_f_to_f_gpu(const Nbnxm::AtomLocality
 }
 
 void
-nonbonded_verlet_t::atomdata_init_add_nbat_f_to_f_gpu()
+nonbonded_verlet_t::atomdata_init_add_nbat_f_to_f_gpu(GpuEventSynchronizer* const localReductionDone)
 {
 
     wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
@@ -229,7 +229,8 @@ nonbonded_verlet_t::atomdata_init_add_nbat_f_to_f_gpu()
 
     Nbnxm::nbnxn_gpu_init_add_nbat_f_to_f(gridSet.cells().data(),
                                           gpu_nbv,
-                                          gridSet.numRealAtomsTotal());
+                                          gridSet.numRealAtomsTotal(),
+                                          localReductionDone);
 
     wallcycle_sub_stop(wcycle_, ewcsNB_F_BUF_OPS);
     wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
index 4378663b933cea0b1139e7596a12ce59cb7d2865..9e58003db287e4a7cfbc4ed3a521ce01b9491dad 100644 (file)
@@ -359,8 +359,11 @@ struct nonbonded_verlet_t
                                           bool                                        useGpuFPmeReduction,
                                           bool                                        accumulateForce);
 
-        /*! \brief Outer body of function to perform initialization for F buffer operations on GPU. */
-        void atomdata_init_add_nbat_f_to_f_gpu();
+        /*! \brief Outer body of function to perform initialization for F buffer operations on GPU.
+         *
+         * \param localReductionDone     Pointer to an event synchronizer that marks the completion of the local f buffer ops kernel.
+         */
+        void atomdata_init_add_nbat_f_to_f_gpu(GpuEventSynchronizer* localReductionDone);
 
         /*! \brief return pointer to GPU event recorded when coordinates have been copied to device */
         void* get_x_on_device_event();
index 1061292b2da57a9dd50eb8db0332a2736eeaecbd..b0941b46b0b1ff8f98a9b82d8ae307f9e0d3cc03 100644 (file)
@@ -290,9 +290,10 @@ bool haveGpuShortRangeWork(const gmx_nbnxn_gpu_t     gmx_unused *nb,
 
 /*! \brief Initialization for F buffer operations on GPU */
 CUDA_FUNC_QUALIFIER
-void nbnxn_gpu_init_add_nbat_f_to_f(const int               gmx_unused *cell,
-                                    gmx_nbnxn_gpu_t         gmx_unused *gpu_nbv,
-                                    int                     gmx_unused  natoms_total) CUDA_FUNC_TERM;
+void nbnxn_gpu_init_add_nbat_f_to_f(const int                   gmx_unused *cell,
+                                    gmx_nbnxn_gpu_t             gmx_unused *gpu_nbv,
+                                    int                         gmx_unused  natoms_total,
+                                    GpuEventSynchronizer        gmx_unused *localReductionDone) CUDA_FUNC_TERM;
 
 /*! \brief Force buffer operations on GPU.
  *