From f310be38d375b49f4312c6ee0bd6cd62729174cf Mon Sep 17 00:00:00 2001
From: =?utf8?q?Szil=C3=A1rd=20P=C3=A1ll?=
Date: Mon, 14 Oct 2019 19:19:03 +0200
Subject: [PATCH] Trigger synchronizer when local forces are ready

The synchronizer is created and managed in StatePropagatorDataGpu
and is passed to the nonbonded module at the f buffer ops init.

Refs #2888 #3126

Change-Id: Ie9bf0b6cd8511fe282e377e48f3940e591db214c
---
 src/gromacs/mdlib/sim_util.cpp                 |  4 +++-
 src/gromacs/nbnxm/cuda/nbnxm_cuda.cu           |  5 +++++
 src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu | 10 +++++++---
 src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h      |  6 ++++++
 src/gromacs/nbnxm/nbnxm.cpp                    |  5 +++--
 src/gromacs/nbnxm/nbnxm.h                      |  7 +++++--
 src/gromacs/nbnxm/nbnxm_gpu.h                  |  7 ++++---
 7 files changed, 33 insertions(+), 11 deletions(-)

diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index 2f2cade7db..5c40ff7f16 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -1155,7 +1155,8 @@ void do_force(FILE *fplog,
         // NS step is also a virial step (on which f buf ops are deactivated).
         if (simulationWork.useGpuBufferOps && simulationWork.useGpuNonbonded && (GMX_GPU == GMX_GPU_CUDA))
         {
-            nbv->atomdata_init_add_nbat_f_to_f_gpu();
+            GMX_ASSERT(stateGpu, "stateGpu should be valid here");
+            nbv->atomdata_init_add_nbat_f_to_f_gpu(stateGpu->fReducedOnDevice());
         }
     }
     else if (!EI_TPI(inputrec->eI))
@@ -1544,6 +1545,7 @@ void do_force(FILE *fplog,
                                               pme_gpu_get_device_f(fr->pmedata),
                                               dependencyList,
                                               false, haveNonLocalForceContribInCpuBuffer);
+            // TODO: this should be conditional on whether GPU direct comm is used?
            stateGpu->copyForcesFromGpu(forceOut.forceWithShiftForces().force(), gmx::StatePropagatorDataGpu::AtomLocality::NonLocal);
         }
         else
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
index 7b1241e5e2..201a97055f 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
@@ -879,6 +879,11 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality atomL
 
     launchGpuKernel(kernelFn, config, nullptr, "FbufferOps", kernelArgs);
 
+    if (atomLocality == AtomLocality::Local)
+    {
+        GMX_ASSERT(nb->localFReductionDone != nullptr, "localFReductionDone has to be a valid pointer");
+        nb->localFReductionDone->markEvent(stream);
+    }
 }
 
 void* nbnxn_get_x_on_device_event(const gmx_nbnxn_cuda_t *nb)
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
index 71e25d23b1..67e7b581e7 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
@@ -963,13 +963,17 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet &gridSet,
 }
 
 /* Initialization for F buffer operations on GPU.
  */
-void nbnxn_gpu_init_add_nbat_f_to_f(const int             *cell,
-                                    gmx_nbnxn_gpu_t       *gpu_nbv,
-                                    int                    natoms_total)
+void nbnxn_gpu_init_add_nbat_f_to_f(const int                   *cell,
+                                    gmx_nbnxn_gpu_t             *gpu_nbv,
+                                    int                          natoms_total,
+                                    GpuEventSynchronizer* const  localReductionDone)
 {
     cudaStream_t stream = gpu_nbv->stream[InteractionLocality::Local];
 
+    GMX_ASSERT(localReductionDone, "localReductionDone should be a valid pointer");
+    gpu_nbv->localFReductionDone = localReductionDone;
+
     if (natoms_total > 0)
     {
         reallocateDeviceBuffer(&gpu_nbv->cell, natoms_total, &gpu_nbv->ncell, &gpu_nbv->ncell_alloc, nullptr);
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
index f3fc0e8852..4667d63c42 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
@@ -274,6 +274,12 @@ struct gmx_nbnxn_cuda_t
                              any dependent task (e.g. transfer of coordinates to the PME rank's GPU) can proceed. */
 
+    /*! \brief Pointer to event synchronizer triggered when the local GPU buffer ops / reduction is complete
+     *
+     * \note The synchronizer is managed outside of this module in StatePropagatorDataGpu.
+     */
+    GpuEventSynchronizer *localFReductionDone;
+
     GpuEventSynchronizer *xNonLocalCopyD2HDone; /**< event triggered when non-local coordinate buffer has been copied from device to host*/
 
diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp
index 3a796b504e..52d2fecb13 100644
--- a/src/gromacs/nbnxm/nbnxm.cpp
+++ b/src/gromacs/nbnxm/nbnxm.cpp
@@ -219,7 +219,7 @@ nonbonded_verlet_t::atomdata_add_nbat_f_to_f_gpu(const Nbnxm::AtomLocality
 }
 
 void
-nonbonded_verlet_t::atomdata_init_add_nbat_f_to_f_gpu()
+nonbonded_verlet_t::atomdata_init_add_nbat_f_to_f_gpu(GpuEventSynchronizer* const localReductionDone)
 {
     wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
 
@@ -229,7 +229,8 @@ nonbonded_verlet_t::atomdata_init_add_nbat_f_to_f_gpu()
 
     Nbnxm::nbnxn_gpu_init_add_nbat_f_to_f(gridSet.cells().data(),
                                           gpu_nbv,
-                                          gridSet.numRealAtomsTotal());
+                                          gridSet.numRealAtomsTotal(),
+                                          localReductionDone);
 
     wallcycle_sub_stop(wcycle_, ewcsNB_F_BUF_OPS);
     wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h
index 4378663b93..9e58003db2 100644
--- a/src/gromacs/nbnxm/nbnxm.h
+++ b/src/gromacs/nbnxm/nbnxm.h
@@ -359,8 +359,11 @@ struct nonbonded_verlet_t
                                            bool                           useGpuFPmeReduction,
                                            bool                           accumulateForce);
 
-        /*! \brief Outer body of function to perform initialization for F buffer operations on GPU. */
-        void atomdata_init_add_nbat_f_to_f_gpu();
+        /*! \brief Outer body of function to perform initialization for F buffer operations on GPU.
+         *
+         * \param localReductionDone  Pointer to an event synchronizer that marks the completion of the local f buffer ops kernel.
+         */
+        void atomdata_init_add_nbat_f_to_f_gpu(GpuEventSynchronizer* localReductionDone);
 
         /*! \brief return pointer to GPU event recorded when coordinates have been copied to device */
         void* get_x_on_device_event();
diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h
index 1061292b2d..b0941b46b0 100644
--- a/src/gromacs/nbnxm/nbnxm_gpu.h
+++ b/src/gromacs/nbnxm/nbnxm_gpu.h
@@ -290,9 +290,10 @@ bool haveGpuShortRangeWork(const gmx_nbnxn_gpu_t gmx_unused *nb,
 /*!
  * \brief Initialization for F buffer operations on GPU */
 CUDA_FUNC_QUALIFIER
-void nbnxn_gpu_init_add_nbat_f_to_f(const int gmx_unused         *cell,
-                                    gmx_nbnxn_gpu_t gmx_unused   *gpu_nbv,
-                                    int gmx_unused                natoms_total) CUDA_FUNC_TERM;
+void nbnxn_gpu_init_add_nbat_f_to_f(const int gmx_unused             *cell,
+                                    gmx_nbnxn_gpu_t gmx_unused       *gpu_nbv,
+                                    int gmx_unused                    natoms_total,
+                                    GpuEventSynchronizer gmx_unused  *localReductionDone) CUDA_FUNC_TERM;
 
 /*! \brief Force buffer operations on GPU.
  *
-- 
2.22.0
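
The change relies on the standard CUDA record/wait idiom: the stream that runs the local F buffer-ops/reduction kernel records an event right after the kernel launch (the markEvent() call added above), and any stream that later consumes the reduced forces enqueues a wait on that event instead of blocking the host. Below is a minimal standalone sketch of that idiom using raw CUDA events rather than GROMACS' GpuEventSynchronizer; the class and function names (LocalReductionDoneSync, exampleUsage) and the two streams are illustrative assumptions, not part of this patch.

#include <cuda_runtime.h>

// Simplified stand-in for the event synchronizer: one stream marks the event,
// another stream (or the host) waits on it. Not GROMACS code.
class LocalReductionDoneSync
{
    public:
        LocalReductionDoneSync() { cudaEventCreateWithFlags(&event_, cudaEventDisableTiming); }
        ~LocalReductionDoneSync() { cudaEventDestroy(event_); }

        //! Producer side: record the event in the stream that runs the local reduction kernel.
        void markEvent(cudaStream_t localStream) { cudaEventRecord(event_, localStream); }

        //! Consumer side: make another stream wait until the local reduction has completed.
        void enqueueWaitEvent(cudaStream_t consumerStream) { cudaStreamWaitEvent(consumerStream, event_, 0); }

    private:
        cudaEvent_t event_;
};

// Usage sketch: the owner (StatePropagatorDataGpu in this change) creates the
// synchronizer and passes a pointer to the nonbonded module once, at F buffer
// ops init; the module marks it after the local reduction, and dependent work
// (e.g. a transfer or halo-exchange stream) waits on it.
void exampleUsage(cudaStream_t localStream, cudaStream_t consumerStream)
{
    LocalReductionDoneSync localFReductionDone;

    // ... launch the local F buffer-ops / reduction kernel in localStream ...

    localFReductionDone.markEvent(localStream);            // producer: local forces are ready

    localFReductionDone.enqueueWaitEvent(consumerStream);  // consumer: safe to read reduced forces
    // ... enqueue work in consumerStream that reads the reduced force buffer ...
}

Keeping the dependency on the GPU timeline means the host never has to synchronize; the consumer only needs the synchronizer pointer, which is why it is handed over once at f buffer ops init rather than every step.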