*
* Copyright (c) 1991-2000, University of Groningen, The Netherlands.
* Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2013-2019, by the GROMACS development team, led by
+ * Copyright (c) 2013-2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#include "gromacs/domdec/partition.h"
#include "gromacs/essentialdynamics/edsam.h"
#include "gromacs/ewald/pme.h"
+#include "gromacs/ewald/pme_pp.h"
#include "gromacs/ewald/pme_pp_comm_gpu.h"
#include "gromacs/gmxlib/network.h"
#include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
#include "gromacs/mdlib/constr.h"
#include "gromacs/mdlib/enerdata_utils.h"
#include "gromacs/mdlib/force.h"
+#include "gromacs/mdlib/force_flags.h"
#include "gromacs/mdlib/forcerec.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
-#include "gromacs/mdlib/qmmm.h"
#include "gromacs/mdlib/update.h"
+#include "gromacs/mdlib/vsite.h"
+#include "gromacs/mdlib/wholemoleculetransform.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/mdtypes/enerdata.h"
#include "gromacs/mdtypes/forceoutput.h"
+#include "gromacs/mdtypes/forcerec.h"
#include "gromacs/mdtypes/iforceprovider.h"
#include "gromacs/mdtypes/inputrec.h"
#include "gromacs/mdtypes/md_enums.h"
+#include "gromacs/mdtypes/mdatom.h"
#include "gromacs/mdtypes/simulation_workload.h"
#include "gromacs/mdtypes/state.h"
#include "gromacs/mdtypes/state_propagator_data_gpu.h"
#include "gromacs/nbnxm/gpu_data_mgmt.h"
#include "gromacs/nbnxm/nbnxm.h"
+#include "gromacs/nbnxm/nbnxm_gpu.h"
#include "gromacs/pbcutil/ishift.h"
-#include "gromacs/pbcutil/mshift.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/pulling/pull.h"
#include "gromacs/pulling/pull_rotation.h"
const rvec x[],
const gmx::ForceWithShiftForces& forceWithShiftForces,
tensor vir_part,
- const t_graph* graph,
const matrix box,
t_nrnb* nrnb,
const t_forcerec* fr,
- int ePBC)
+ PbcType pbcType)
{
/* The short-range virial from surrounding boxes */
const rvec* fshift = as_rvec_array(forceWithShiftForces.shiftForces().data());
- calc_vir(SHIFTS, fr->shift_vec, fshift, vir_part, ePBC == epbcSCREW, box);
+ calc_vir(SHIFTS, fr->shift_vec, fshift, vir_part, pbcType == PbcType::Screw, box);
inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
/* Calculate partial virial, for local atoms only, based on short range.
* Total virial is computed in global_stat, called from do_md
*/
const rvec* f = as_rvec_array(forceWithShiftForces.force().data());
- f_calc_vir(start, start + homenr, x, f, vir_part, graph, box);
+ f_calc_vir(start, start + homenr, x, f, vir_part, box);
inc_nrnb(nrnb, eNR_VIRIAL, homenr);
if (debug)
* which is why pull_potential is called close to other communication.
*/
wallcycle_start(wcycle, ewcPULLPOT);
- set_pbc(&pbc, ir->ePBC, box);
+ set_pbc(&pbc, ir->pbcType, box);
dvdl = 0;
enerd->term[F_COM_PULL] += pull_potential(pull_work, mdatoms, &pbc, cr, t, lambda[efptRESTRAINT],
as_rvec_array(x.data()), force, &dvdl);
enerd->dvdl_lin[efptRESTRAINT] += dvdl;
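+        /* Also add the pull dV/dlambda contribution to the accumulated
+         * dH/dlambda values */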
+ for (auto& dhdl : enerd->dhdlLambda)
+ {
+ dhdl += dvdl;
+ }
wallcycle_stop(wcycle, ewcPULLPOT);
}
enerd->dvdl_lin[efptCOUL] += dvdl_q;
enerd->dvdl_lin[efptVDW] += dvdl_lj;
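+    /* Also add the Coulomb and VdW dV/dlambda contributions to the
+     * accumulated dH/dlambda values */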
+ for (auto& dhdl : enerd->dhdlLambda)
+ {
+ dhdl += dvdl_q + dvdl_lj;
+ }
+
if (wcycle)
{
dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
ForceOutputs* forceOutputs,
tensor vir_force,
const t_mdatoms* mdatoms,
- const t_graph* graph,
const t_forcerec* fr,
const gmx_vsite_t* vsite,
const StepWorkload& stepWork)
*/
matrix virial = { { 0 } };
spread_vsite_f(vsite, x, fDirectVir, nullptr, stepWork.computeVirial, virial, nrnb,
- &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
+ top->idef, fr->pbcType, fr->bMolPBC, box, cr, wcycle);
forceWithVirial.addVirialContribution(virial);
}
if (awh)
{
enerd->term[F_COM_PULL] += awh->applyBiasForcesAndUpdateBias(
- inputrec->ePBC, *mdatoms, box, forceWithVirial, t, step, wcycle, fplog);
+ inputrec->pbcType, *mdatoms, box, forceWithVirial, t, step, wcycle, fplog);
}
}
}
}
-/*! \brief Makes PME flags from StepWorkload data.
- *
- * \param[in] stepWork Step schedule flags
- * \returns PME flags
- */
-static int makePmeFlags(const StepWorkload& stepWork)
-{
- return GMX_PME_SPREAD | GMX_PME_SOLVE | (stepWork.computeVirial ? GMX_PME_CALC_ENER_VIR : 0)
- | (stepWork.computeEnergy ? GMX_PME_CALC_ENER_VIR : 0)
- | (stepWork.computeForces ? GMX_PME_CALC_F : 0);
-}
-
/*! \brief Launch the prepare_step and spread stages of PME GPU.
*
* \param[in] pmedata The PME structure
* \param[in] box The box matrix
* \param[in] stepWork Step schedule flags
- * \param[in] pmeFlags PME flags
* \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready
*                           in the device memory.
* \param[in] wcycle         The wallcycle structure
*/
static inline void launchPmeGpuSpread(gmx_pme_t* pmedata,
const matrix box,
const StepWorkload& stepWork,
- int pmeFlags,
GpuEventSynchronizer* xReadyOnDevice,
gmx_wallcycle_t wcycle)
{
- pme_gpu_prepare_computation(pmedata, stepWork.haveDynamicBox, box, wcycle, pmeFlags,
- stepWork.useGpuPmeFReduction);
+ pme_gpu_prepare_computation(pmedata, box, wcycle, stepWork);
pme_gpu_launch_spread(pmedata, xReadyOnDevice, wcycle);
}
*
* \param[in] pmedata The PME structure
* \param[in] wcycle The wallcycle structure
+ * \param[in] stepWork Step schedule flags
*/
-static void launchPmeGpuFftAndGather(gmx_pme_t* pmedata, gmx_wallcycle_t wcycle)
+static void launchPmeGpuFftAndGather(gmx_pme_t* pmedata, gmx_wallcycle_t wcycle, const gmx::StepWorkload& stepWork)
{
- pme_gpu_launch_complex_transforms(pmedata, wcycle);
- pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
+ pme_gpu_launch_complex_transforms(pmedata, wcycle, stepWork);
+ pme_gpu_launch_gather(pmedata, wcycle);
}
/*! \brief
* \param[in,out] forceOutputs Output buffer for the forces and virial
* \param[in,out] enerd Energy data structure results are reduced into
* \param[in] stepWork Step schedule flags
- * \param[in] pmeFlags PME flags
* \param[in] wcycle The wallcycle structure
*/
static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t* nbv,
gmx::ForceOutputs* forceOutputs,
gmx_enerdata_t* enerd,
const StepWorkload& stepWork,
- int pmeFlags,
gmx_wallcycle_t wcycle)
{
bool isPmeGpuDone = false;
{
GpuTaskCompletion completionType =
(isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
- isPmeGpuDone = pme_gpu_try_finish_task(pmedata, pmeFlags, wcycle, &forceWithVirial,
+ isPmeGpuDone = pme_gpu_try_finish_task(pmedata, stepWork, wcycle, &forceWithVirial,
enerd, completionType);
}
/*! \brief Set up flags that have the lifetime of the domain, indicating what type of work there is to compute.
*/
-static DomainLifetimeWorkload setupDomainLifetimeWorkload(const t_inputrec& inputrec,
- const t_forcerec& fr,
- const pull_t* pull_work,
- const gmx_edsam* ed,
- const t_idef& idef,
- const t_fcdata& fcd,
- const t_mdatoms& mdatoms,
+static DomainLifetimeWorkload setupDomainLifetimeWorkload(const t_inputrec& inputrec,
+ const t_forcerec& fr,
+ const pull_t* pull_work,
+ const gmx_edsam* ed,
+ const InteractionDefinitions& idef,
+ const t_fcdata& fcd,
+ const t_mdatoms& mdatoms,
const SimulationWorkload& simulationWork,
const StepWorkload& stepWork)
{
/*! \brief Launch end-of-step GPU tasks: buffer clearing and rolling pruning.
*
- * TODO: eliminate the \p useGpuNonbonded and \p useGpuNonbonded when these are
+ * TODO: eliminate \p useGpuPmeOnThisRank when this is
* incorporated in DomainLifetimeWorkload.
*/
static void launchGpuEndOfStepTasks(nonbonded_verlet_t* nbv,
gmx_pme_t* pmedata,
gmx_enerdata_t* enerd,
const gmx::MdrunScheduleWorkload& runScheduleWork,
- bool useGpuNonbonded,
- bool useGpuPme,
+ bool useGpuPmeOnThisRank,
int64_t step,
gmx_wallcycle_t wcycle)
{
- if (useGpuNonbonded)
+ if (runScheduleWork.simulationWork.useGpuNonbonded)
{
/* Launch pruning before buffer clearing because the API overhead of the
* clear kernel launches can leave the GPU idle while it could be running
wallcycle_stop(wcycle, ewcLAUNCH_GPU);
}
- if (useGpuPme)
+ if (useGpuPmeOnThisRank)
{
pme_gpu_reinit_computation(pmedata, wcycle);
}
}
}
+//! \brief Data structure to hold dipole-related data and staging arrays
+struct DipoleData
+{
+ //! Dipole staging for fast summing over MPI
+ gmx::DVec muStaging[2] = { { 0.0, 0.0, 0.0 } };
+ //! Dipole staging for states A and B (index 0 and 1 resp.)
+ gmx::RVec muStateAB[2] = { { 0.0_real, 0.0_real, 0.0_real } };
+};
+
+
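+/*! \brief Reduce the dipole staging buffer over the ranks and set the total dipole.
+ *
+ * With free-energy perturbation the total dipole is interpolated linearly
+ * between the A and B states using the Coulomb lambda.
+ */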
+static void reduceAndUpdateMuTot(DipoleData* dipoleData,
+ const t_commrec* cr,
+ const bool haveFreeEnergy,
+ gmx::ArrayRef<const real> lambda,
+ rvec muTotal,
+ const DDBalanceRegionHandler& ddBalanceRegionHandler)
+{
+ if (PAR(cr))
+ {
+ gmx_sumd(2 * DIM, dipoleData->muStaging[0], cr);
+ ddBalanceRegionHandler.reopenRegionCpu();
+ }
+ for (int i = 0; i < 2; i++)
+ {
+ for (int j = 0; j < DIM; j++)
+ {
+ dipoleData->muStateAB[i][j] = dipoleData->muStaging[i][j];
+ }
+ }
+
+ if (!haveFreeEnergy)
+ {
+ copy_rvec(dipoleData->muStateAB[0], muTotal);
+ }
+ else
+ {
+ for (int j = 0; j < DIM; j++)
+ {
+ muTotal[j] = (1.0 - lambda[efptCOUL]) * dipoleData->muStateAB[0][j]
+ + lambda[efptCOUL] * dipoleData->muStateAB[1][j];
+ }
+ }
+}
void do_force(FILE* fplog,
const t_commrec* cr,
gmx_enerdata_t* enerd,
t_fcdata* fcd,
gmx::ArrayRef<real> lambda,
- t_graph* graph,
t_forcerec* fr,
gmx::MdrunScheduleWorkload* runScheduleWork,
const gmx_vsite_t* vsite,
- rvec mu_tot,
+ rvec muTotal,
double t,
gmx_edsam* ed,
int legacyFlags,
const DDBalanceRegionHandler& ddBalanceRegionHandler)
{
- int i, j;
- double mu[2 * DIM];
+ GMX_ASSERT(force.unpaddedArrayRef().ssize() >= fr->natoms_force_constr,
+ "The size of the force buffer should be at least the number of atoms to compute "
+ "forces for");
+
nonbonded_verlet_t* nbv = fr->nbv.get();
interaction_const_t* ic = fr->ic;
gmx::StatePropagatorDataGpu* stateGpu = fr->stateGpu;
- // TODO remove the code below when the legacy flags are not in use anymore
- /* modify force flag if not doing nonbonded */
- if (!fr->bNonbonded)
- {
- legacyFlags &= ~GMX_FORCE_NONBONDED;
- }
-
const SimulationWorkload& simulationWork = runScheduleWork->simulationWork;
const bool useGpuPmeOnThisRank = simulationWork.useGpuPme && thisRankHasDuty(cr, DUTY_PME);
- const int pmeFlags = makePmeFlags(stepWork);
-
- // Switches on whether to use GPU for position and force buffer operations
- // TODO consider all possible combinations of triggers, and how to combine optimally in each case.
- const BufferOpsUseGpu useGpuXBufOps =
- stepWork.useGpuXBufferOps ? BufferOpsUseGpu::True : BufferOpsUseGpu::False;
- // GPU Force buffer ops are disabled on virial steps, because the virial calc is not yet ported to GPU
- const BufferOpsUseGpu useGpuFBufOps =
- stepWork.useGpuFBufferOps ? BufferOpsUseGpu::True : BufferOpsUseGpu::False;
/* At a search step we need to start the first balancing region
* somewhere early inside the step after communication during domain
ddBalanceRegionHandler.openBeforeForceComputationCpu(DdAllowBalanceRegionReopen::yes);
}
- const int start = 0;
- const int homenr = mdatoms->homenr;
-
clear_mat(vir_force);
- if (stepWork.stateChanged)
- {
- if (inputrecNeedMutot(inputrec))
- {
- /* Calculate total (local) dipole moment in a temporary common array.
- * This makes it possible to sum them over nodes faster.
- */
- calc_mu(start, homenr, x.unpaddedArrayRef(), mdatoms->chargeA, mdatoms->chargeB,
- mdatoms->nChargePerturbed, mu, mu + DIM);
- }
- }
-
- if (fr->ePBC != epbcNONE)
+ if (fr->pbcType != PbcType::No)
{
/* Compute shift vectors every step,
* because of pressure coupling or box deformation!
const bool calcCGCM = (fillGrid && !DOMAINDECOMP(cr));
if (calcCGCM)
{
- put_atoms_in_box_omp(fr->ePBC, box, x.unpaddedArrayRef().subArray(0, homenr),
+ put_atoms_in_box_omp(fr->pbcType, box, x.unpaddedArrayRef().subArray(0, mdatoms->homenr),
gmx_omp_nthreads_get(emntDefault));
- inc_nrnb(nrnb, eNR_SHIFTX, homenr);
- }
- else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
- {
- unshift_self(graph, box, as_rvec_array(x.unpaddedArrayRef().data()));
+ inc_nrnb(nrnb, eNR_SHIFTX, mdatoms->homenr);
}
}
nbnxn_atomdata_copy_shiftvec(stepWork.haveDynamicBox, fr->shift_vec, nbv->nbat.get());
-#if GMX_MPI
const bool pmeSendCoordinatesFromGpu =
- simulationWork.useGpuPmePpCommunication && !(stepWork.doNeighborSearch);
+ GMX_MPI && simulationWork.useGpuPmePpCommunication && !(stepWork.doNeighborSearch);
const bool reinitGpuPmePpComms =
- simulationWork.useGpuPmePpCommunication && (stepWork.doNeighborSearch);
-#endif
+ GMX_MPI && simulationWork.useGpuPmePpCommunication && (stepWork.doNeighborSearch);
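+    // The coordinates-ready-on-device event is only needed when PME runs on
+    // this rank's GPU or when GPU buffer ops are in use.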
- const auto localXReadyOnDevice = (stateGpu != nullptr)
+ const auto localXReadyOnDevice = (useGpuPmeOnThisRank || simulationWork.useGpuBufferOps)
? stateGpu->getCoordinatesReadyOnDeviceEvent(
AtomLocality::Local, simulationWork, stepWork)
: nullptr;
-#if GMX_MPI
// If coordinates are to be sent to PME task from CPU memory, perform that send here.
// Otherwise the send will occur after H2D coordinate transfer.
- if (!thisRankHasDuty(cr, DUTY_PME) && !pmeSendCoordinatesFromGpu)
+ if (GMX_MPI && !thisRankHasDuty(cr, DUTY_PME) && !pmeSendCoordinatesFromGpu)
{
- /* Send particle coordinates to the pme nodes.
- * Since this is only implemented for domain decomposition
- * and domain decomposition does not use the graph,
- * we do not need to worry about shifting.
- */
+ /* Send particle coordinates to the pme nodes */
if (!stepWork.doNeighborSearch && simulationWork.useGpuUpdate)
{
GMX_RELEASE_ASSERT(false,
step, simulationWork.useGpuPmePpCommunication, reinitGpuPmePpComms,
pmeSendCoordinatesFromGpu, localXReadyOnDevice, wcycle);
}
-#endif /* GMX_MPI */
// Coordinates on the device are needed if PME or BufferOps are offloaded.
// The local coordinates can be copied right away.
// NOTE: Consider moving this copy to right after they are updated and constrained,
//       if the latter is not offloaded.
- if (useGpuPmeOnThisRank || useGpuXBufOps == BufferOpsUseGpu::True)
+ if (useGpuPmeOnThisRank || stepWork.useGpuXBufferOps)
{
if (stepWork.doNeighborSearch)
{
+ // TODO refactor this to do_md, after partitioning.
stateGpu->reinit(mdatoms->homenr,
cr->dd != nullptr ? dd_numAtomsZones(*cr->dd) : mdatoms->homenr);
if (useGpuPmeOnThisRank)
// The conditions for gpuHaloExchange (e.g. using GPU buffer
// operations) were checked before construction, so here we can
// just use it and only assert on those conditions.
- gmx::GpuHaloExchange* gpuHaloExchange =
- (havePPDomainDecomposition(cr) ? cr->dd->gpuHaloExchange.get() : nullptr);
- const bool ddUsesGpuDirectCommunication = (gpuHaloExchange != nullptr);
- GMX_ASSERT(!ddUsesGpuDirectCommunication || (useGpuXBufOps == BufferOpsUseGpu::True),
+ const bool ddUsesGpuDirectCommunication =
+ ((cr->dd != nullptr) && (!cr->dd->gpuHaloExchange.empty()));
+ GMX_ASSERT(!ddUsesGpuDirectCommunication || stepWork.useGpuXBufferOps,
"Must use coordinate buffer ops with GPU halo exchange");
- const bool useGpuForcesHaloExchange =
- ddUsesGpuDirectCommunication && (useGpuFBufOps == BufferOpsUseGpu::True);
+ const bool useGpuForcesHaloExchange = ddUsesGpuDirectCommunication && stepWork.useGpuFBufferOps;
// Copy coordinates from the GPU if update is on the GPU and there
// are forces to be computed on the CPU, or for the computation of
haveCopiedXFromGpu = true;
}
-#if GMX_MPI
// If coordinates are to be sent to PME task from GPU memory, perform that send here.
// Otherwise the send will occur before the H2D coordinate transfer.
if (!thisRankHasDuty(cr, DUTY_PME) && pmeSendCoordinatesFromGpu)
{
- /* Send particle coordinates to the pme nodes.
- * Since this is only implemented for domain decomposition
- * and domain decomposition does not use the graph,
- * we do not need to worry about shifting.
- */
+ /* Send particle coordinates to the pme nodes */
gmx_pme_send_coordinates(fr, cr, box, as_rvec_array(x.unpaddedArrayRef().data()), lambda[efptCOUL],
lambda[efptVDW], (stepWork.computeVirial || stepWork.computeEnergy),
step, simulationWork.useGpuPmePpCommunication, reinitGpuPmePpComms,
pmeSendCoordinatesFromGpu, localXReadyOnDevice, wcycle);
}
-#endif /* GMX_MPI */
if (useGpuPmeOnThisRank)
{
- launchPmeGpuSpread(fr->pmedata, box, stepWork, pmeFlags, localXReadyOnDevice, wcycle);
+ launchPmeGpuSpread(fr->pmedata, box, stepWork, localXReadyOnDevice, wcycle);
}
/* do gridding for pair search */
if (stepWork.doNeighborSearch)
{
- if (graph && stepWork.stateChanged)
+ if (fr->wholeMoleculeTransform && stepWork.stateChanged)
{
- /* Calculate intramolecular shift vectors to make molecules whole */
- mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.unpaddedArrayRef().data()));
+ fr->wholeMoleculeTransform->updateForAtomPbcJumps(x.unpaddedArrayRef(), box);
}
// TODO
}
wallcycle_stop(wcycle, ewcLAUNCH_GPU);
}
- }
- if (stepWork.doNeighborSearch)
- {
// Need to run after the GPU-offload bonded interaction lists
// are set up to be able to determine whether there is bonded work.
runScheduleWork->domainWork = setupDomainLifetimeWorkload(
wallcycle_start_nocount(wcycle, ewcNS);
wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
/* Note that with a GPU the launch overhead of the list transfer is not timed separately */
- nbv->constructPairlist(InteractionLocality::Local, &top->excls, step, nrnb);
+ nbv->constructPairlist(InteractionLocality::Local, top->excls, step, nrnb);
nbv->setupGpuShortRangeWork(fr->gpuBonded, InteractionLocality::Local);
wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
wallcycle_stop(wcycle, ewcNS);
- if (useGpuXBufOps == BufferOpsUseGpu::True)
+ if (stepWork.useGpuXBufferOps)
{
nbv->atomdata_init_copy_x_to_nbat_x_gpu();
}
// For force buffer ops, we use the below condition rather than
- // useGpuFBufOps to ensure that init is performed even if this
+ // useGpuFBufferOps to ensure that init is performed even if this
// NS step is also a virial step (on which f buf ops are deactivated).
if (simulationWork.useGpuBufferOps && simulationWork.useGpuNonbonded && (GMX_GPU == GMX_GPU_CUDA))
{
}
else if (!EI_TPI(inputrec->eI))
{
- if (useGpuXBufOps == BufferOpsUseGpu::True)
+ if (stepWork.useGpuXBufferOps)
{
GMX_ASSERT(stateGpu, "stateGpu should be valid when buffer ops are offloaded");
nbv->convertCoordinatesGpu(AtomLocality::Local, false, stateGpu->getCoordinates(),
wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
Nbnxm::gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat.get());
- if (stepWork.doNeighborSearch || (useGpuXBufOps == BufferOpsUseGpu::False))
+ if (stepWork.doNeighborSearch || !stepWork.useGpuXBufferOps)
{
Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(), AtomLocality::Local);
}
if (domainWork.haveGpuBondedWork && !havePPDomainDecomposition(cr))
{
wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
- fr->gpuBonded->launchKernel(fr, stepWork, box);
+ fr->gpuBonded->setPbcAndlaunchKernel(fr->pbcType, box, fr->bMolPBC, stepWork);
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
}
// X copy/transform to allow overlap as well as after the GPU NB
// launch to avoid FFT launch overhead hijacking the CPU and delaying
// the nonbonded kernel.
- launchPmeGpuFftAndGather(fr->pmedata, wcycle);
+ launchPmeGpuFftAndGather(fr->pmedata, wcycle, stepWork);
}
/* Communicate coordinates and sum dipole if necessary +
wallcycle_start_nocount(wcycle, ewcNS);
wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
/* Note that with a GPU the launch overhead of the list transfer is not timed separately */
- nbv->constructPairlist(InteractionLocality::NonLocal, &top->excls, step, nrnb);
+ nbv->constructPairlist(InteractionLocality::NonLocal, top->excls, step, nrnb);
nbv->setupGpuShortRangeWork(fr->gpuBonded, InteractionLocality::NonLocal);
wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
wallcycle_stop(wcycle, ewcNS);
+ // TODO refactor this GPU halo exchange re-initialisation
+ // to location in do_md where GPU halo exchange is
+ // constructed at partitioning, after above stateGpu
+ // re-initialization has similarly been refactored
if (ddUsesGpuDirectCommunication)
{
- gpuHaloExchange->reinitHalo(stateGpu->getCoordinates(), stateGpu->getForces());
+ reinitGpuHaloExchange(*cr, stateGpu->getCoordinates(), stateGpu->getForces());
}
}
else
{
// The following must be called after local setCoordinates (which records an event
// when the coordinate data has been copied to the device).
- gpuHaloExchange->communicateHaloCoordinates(box, localXReadyOnDevice);
+ communicateGpuHaloCoordinates(*cr, box, localXReadyOnDevice);
if (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork)
{
dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
}
- if (useGpuXBufOps == BufferOpsUseGpu::True)
+ if (stepWork.useGpuXBufferOps)
{
- // The condition here was (pme != nullptr && pme_gpu_get_device_x(fr->pmedata) != nullptr)
if (!useGpuPmeOnThisRank && !ddUsesGpuDirectCommunication)
{
stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), AtomLocality::NonLocal);
{
wallcycle_start(wcycle, ewcLAUNCH_GPU);
- if (stepWork.doNeighborSearch || (useGpuXBufOps == BufferOpsUseGpu::False))
+ if (stepWork.doNeighborSearch || !stepWork.useGpuXBufferOps)
{
wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(), AtomLocality::NonLocal);
if (domainWork.haveGpuBondedWork)
{
wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
- fr->gpuBonded->launchKernel(fr, stepWork, box);
+ fr->gpuBonded->setPbcAndlaunchKernel(fr->pbcType, box, fr->bMolPBC, stepWork);
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
}
wallcycle_stop(wcycle, ewcLAUNCH_GPU);
}
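+    // When a whole-molecule transform is available, obtain coordinates with
+    // molecules made whole; these are used for the dipole calculation below
+    // and are passed on to do_force_lowlevel.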
- if (stepWork.stateChanged && inputrecNeedMutot(inputrec))
+ gmx::ArrayRef<const gmx::RVec> xWholeMolecules;
+ if (fr->wholeMoleculeTransform)
{
- if (PAR(cr))
- {
- gmx_sumd(2 * DIM, mu, cr);
+ xWholeMolecules = fr->wholeMoleculeTransform->wholeMoleculeCoordinates(x.unpaddedArrayRef(), box);
+ }
- ddBalanceRegionHandler.reopenRegionCpu();
- }
+ DipoleData dipoleData;
- for (i = 0; i < 2; i++)
- {
- for (j = 0; j < DIM; j++)
- {
- fr->mu_tot[i][j] = mu[i * DIM + j];
- }
- }
- }
- if (mdatoms->nChargePerturbed == 0)
- {
- copy_rvec(fr->mu_tot[0], mu_tot);
- }
- else
+ if (simulationWork.computeMuTot)
{
- for (j = 0; j < DIM; j++)
- {
- mu_tot[j] = (1.0 - lambda[efptCOUL]) * fr->mu_tot[0][j] + lambda[efptCOUL] * fr->mu_tot[1][j];
- }
+ const int start = 0;
+
+ /* Calculate total (local) dipole moment in a temporary common array.
+ * This makes it possible to sum them over nodes faster.
+ */
+ gmx::ArrayRef<const gmx::RVec> xRef =
+ (xWholeMolecules.empty() ? x.unpaddedArrayRef() : xWholeMolecules);
+ calc_mu(start, mdatoms->homenr, xRef, mdatoms->chargeA, mdatoms->chargeB,
+ mdatoms->nChargePerturbed, dipoleData.muStaging[0], dipoleData.muStaging[1]);
+
+ reduceAndUpdateMuTot(&dipoleData, cr, (fr->efep != efepNO), lambda, muTotal, ddBalanceRegionHandler);
}
/* Reset energies */
}
}
- /* update QMMMrec, if necessary */
- if (fr->bQMMM)
- {
- update_QMMMrec(cr, fr, as_rvec_array(x.unpaddedArrayRef().data()), mdatoms, box);
- }
-
// TODO Force flags should include haveFreeEnergyWork for this domain
if (ddUsesGpuDirectCommunication && (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork))
{
stateGpu->waitCoordinatesReadyOnHost(AtomLocality::NonLocal);
}
/* Compute the bonded and non-bonded energies and optionally forces */
- do_force_lowlevel(fr, inputrec, &(top->idef), cr, ms, nrnb, wcycle, mdatoms, x, hist, &forceOut, enerd,
- fcd, box, lambda.data(), graph, fr->mu_tot, stepWork, ddBalanceRegionHandler);
+ do_force_lowlevel(fr, inputrec, top->idef, cr, ms, nrnb, wcycle, mdatoms, x, xWholeMolecules,
+ hist, &forceOut, enerd, fcd, box, lambda.data(),
+ as_rvec_array(dipoleData.muStateAB), stepWork, ddBalanceRegionHandler);
wallcycle_stop(wcycle, ewcFORCE);
wallcycle_stop(wcycle, ewcFORCE);
}
- if (useGpuFBufOps == BufferOpsUseGpu::True)
+ if (stepWork.useGpuFBufferOps)
{
gmx::FixedCapacityVector<GpuEventSynchronizer*, 1> dependencyList;
stateGpu->copyForcesToGpu(forceOut.forceWithShiftForces().force(),
AtomLocality::NonLocal);
dependencyList.push_back(stateGpu->getForcesReadyOnDeviceEvent(
- AtomLocality::NonLocal, useGpuFBufOps == BufferOpsUseGpu::True));
+ AtomLocality::NonLocal, stepWork.useGpuFBufferOps));
}
nbv->atomdata_add_nbat_f_to_f_gpu(AtomLocality::NonLocal, stateGpu->getForces(),
{
stateGpu->copyForcesToGpu(forceOut.forceWithShiftForces().force(), AtomLocality::Local);
}
- gpuHaloExchange->communicateHaloForces(domainWork.haveCpuLocalForceWork);
+ communicateGpuHaloForces(*cr, domainWork.haveCpuLocalForceWork);
}
else
{
- if (useGpuFBufOps == BufferOpsUseGpu::True)
+ if (stepWork.useGpuFBufferOps)
{
stateGpu->waitForcesReadyOnHost(AtomLocality::NonLocal);
}
// With both nonbonded and PME offloaded to a GPU on the same rank, we use
// an alternating wait/reduction scheme.
bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPmeOnThisRank && simulationWork.useGpuNonbonded
- && !DOMAINDECOMP(cr) && (useGpuFBufOps == BufferOpsUseGpu::False));
+ && !DOMAINDECOMP(cr) && !stepWork.useGpuFBufferOps);
if (alternateGpuWait)
{
- alternatePmeNbGpuWaitReduce(fr->nbv.get(), fr->pmedata, &forceOut, enerd, stepWork, pmeFlags, wcycle);
+ alternatePmeNbGpuWaitReduce(fr->nbv.get(), fr->pmedata, &forceOut, enerd, stepWork, wcycle);
}
if (!alternateGpuWait && useGpuPmeOnThisRank)
{
- pme_gpu_wait_and_reduce(fr->pmedata, pmeFlags, wcycle, &forceOut.forceWithVirial(), enerd);
+ pme_gpu_wait_and_reduce(fr->pmedata, stepWork, wcycle, &forceOut.forceWithVirial(), enerd);
}
/* Wait for local GPU NB outputs on the non-alternating wait path */
gmx::ArrayRef<gmx::RVec> forceWithShift = forceOut.forceWithShiftForces().force();
- if (useGpuFBufOps == BufferOpsUseGpu::True)
+ if (stepWork.useGpuFBufferOps)
{
// Flag to specify whether the CPU force buffer has contributions to
// local atoms. This depends on whether there are CPU-based force tasks
auto locality = havePPDomainDecomposition(cr) ? AtomLocality::Local : AtomLocality::All;
stateGpu->copyForcesToGpu(forceWithShift, locality);
- dependencyList.push_back(stateGpu->getForcesReadyOnDeviceEvent(
- locality, useGpuFBufOps == BufferOpsUseGpu::True));
+ dependencyList.push_back(
+ stateGpu->getForcesReadyOnDeviceEvent(locality, stepWork.useGpuFBufferOps));
}
if (useGpuForcesHaloExchange)
{
- dependencyList.push_back(gpuHaloExchange->getForcesReadyOnDeviceEvent());
+ dependencyList.push_back(cr->dd->gpuHaloExchange[0]->getForcesReadyOnDeviceEvent());
}
nbv->atomdata_add_nbat_f_to_f_gpu(AtomLocality::Local, stateGpu->getForces(), pmeForcePtr,
dependencyList, stepWork.useGpuPmeFReduction,
}
launchGpuEndOfStepTasks(nbv, fr->gpuBonded, fr->pmedata, enerd, *runScheduleWork,
- simulationWork.useGpuNonbonded, useGpuPmeOnThisRank, step, wcycle);
+ useGpuPmeOnThisRank, step, wcycle);
if (DOMAINDECOMP(cr))
{
{
rvec* fshift = as_rvec_array(forceOut.forceWithShiftForces().shiftForces().data());
spread_vsite_f(vsite, as_rvec_array(x.unpaddedArrayRef().data()), f, fshift, FALSE,
- nullptr, nrnb, &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
+ nullptr, nrnb, top->idef, fr->pbcType, fr->bMolPBC, box, cr, wcycle);
}
if (stepWork.computeVirial)
{
/* Calculation of the virial must be done after vsites! */
calc_virial(0, mdatoms->homenr, as_rvec_array(x.unpaddedArrayRef().data()),
- forceOut.forceWithShiftForces(), vir_force, graph, box, nrnb, fr, inputrec->ePBC);
+ forceOut.forceWithShiftForces(), vir_force, box, nrnb, fr, inputrec->pbcType);
}
}
if (stepWork.computeForces)
{
post_process_forces(cr, step, nrnb, wcycle, top, box, as_rvec_array(x.unpaddedArrayRef().data()),
- &forceOut, vir_force, mdatoms, graph, fr, vsite, stepWork);
+ &forceOut, vir_force, mdatoms, fr, vsite, stepWork);
}
if (stepWork.computeEnergy)