 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 * \brief This file contains function definitions necessary for
 * managing the offload of long-ranged PME work to separate MPI ranks,
 * for computing energies and forces (Coulomb and LJ).
 *
 * \author Berk Hess <hess@kth.se>
 *
 * \ingroup module_ewald
#include "gromacs/domdec/domdec.h"
#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/ewald/pme.h"
#include "gromacs/ewald/pme_pp_comm_gpu.h"
#include "gromacs/gmxlib/network.h"
#include "gromacs/math/vec.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/mdtypes/forceoutput.h"
#include "gromacs/mdtypes/forcerec.h"
#include "gromacs/mdtypes/interaction_const.h"
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/mdtypes/state_propagator_data_gpu.h"
#include "gromacs/nbnxm/nbnxm.h"
#include "gromacs/timing/wallcycle.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxmpi.h"
#include "gromacs/utility/smalloc.h"

#include "pme_internal.h"
#include "pme_pp_communication.h"
/*! \brief Whether to delay blocking until communication to PME ranks has completed
 *
 * Delaying the wait until the data is needed again should be faster with a
 * truly non-blocking MPI implementation.
 */
static constexpr bool c_useDelayedWait = false;
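
/* When c_useDelayedWait is false, gmx_pme_send_coeffs_coords() blocks at the
 * end of each send until its MPI_Isend requests have completed. When true,
 * the wait is postponed until the send buffers are needed again, i.e. at the
 * start of the next send and before receiving forces from the PME rank.
 */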
/*! \brief Wait for the pending data send requests to PME ranks to complete */
static void gmx_pme_send_coeffs_coords_wait(gmx_domdec_t *dd)
        MPI_Waitall(dd->nreq_pme, dd->req_pme, MPI_STATUSES_IGNORE);
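        /* The requests in dd->req_pme are the MPI_Isend calls queued by
         * gmx_pme_send_coeffs_coords(); waiting on all of them guarantees
         * that the send buffers can safely be reused. */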
/*! \brief Send data to PME ranks */
static void gmx_pme_send_coeffs_coords(t_forcerec *fr, const t_commrec *cr, unsigned int flags,
                                       real gmx_unused *chargeA, real gmx_unused *chargeB,
                                       real gmx_unused *c6A, real gmx_unused *c6B,
                                       real gmx_unused *sigmaA, real gmx_unused *sigmaB,
                                       const matrix box, const rvec gmx_unused *x,
                                       real lambda_q, real lambda_lj,
                                       int maxshift_x, int maxshift_y,
                                       int64_t step, bool useGpuPmePpComms,
                                       bool reinitGpuPmePpComms,
                                       bool sendCoordinatesFromGpu,
                                       GpuEventSynchronizer *coordinatesReadyOnDeviceEvent)
    gmx_pme_comm_n_box_t *cnb;

    n = dd_numHomeAtoms(*dd);

        fprintf(debug, "PP rank %d sending to PME rank %d: %d%s%s%s%s\n",
                cr->sim_nodeid, dd->pme_nodeid, n,
                (flags & PP_PME_CHARGE) ? " charges" : "",
                (flags & PP_PME_SQRTC6) ? " sqrtC6" : "",
                (flags & PP_PME_SIGMA) ? " sigma" : "",
                (flags & PP_PME_COORD) ? " coordinates" : "");
    if (useGpuPmePpComms)
        flags |= PP_PME_GPUCOMMS;
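        /* PP_PME_GPUCOMMS tells the PME rank that the coordinate and force
         * exchange will use direct GPU communication instead of staging the
         * data through host-side MPI buffers. */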
    if (c_useDelayedWait)
        /* We cannot use cnb until the pending communication has finished */
        gmx_pme_send_coeffs_coords_wait(dd);
    if (dd->pme_receive_vir_ener)
        /* Peer PP node: communicate all data */
        if (dd->cnb == nullptr)

        cnb->maxshift_x = maxshift_x;
        cnb->maxshift_y = maxshift_y;
        cnb->lambda_q   = lambda_q;
        cnb->lambda_lj  = lambda_lj;
        if (flags & PP_PME_COORD)
            copy_mat(box, cnb->box);

        MPI_Isend(cnb, sizeof(*cnb), MPI_BYTE,
                  dd->pme_nodeid, eCommType_CNB, cr->mpi_comm_mysim,
                  &dd->req_pme[dd->nreq_pme++]);
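        /* The cnb struct bundles flags, atom count, box, lambdas and the
         * maximum DD shifts into one small message, so the PME rank knows
         * what per-atom data to expect before the arrays arrive. */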
    else if (flags & (PP_PME_CHARGE | PP_PME_SQRTC6 | PP_PME_SIGMA))
        /* Communicate only the number of atoms */
        MPI_Isend(&n, sizeof(n), MPI_BYTE,
                  dd->pme_nodeid, eCommType_CNB, cr->mpi_comm_mysim,
                  &dd->req_pme[dd->nreq_pme++]);
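        /* PP ranks that do not handle the energy/virial exchange with this
         * PME rank only announce their atom count; the full metadata is sent
         * by the peer PP rank above. */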
    if (flags & PP_PME_CHARGE)
        MPI_Isend(chargeA, n*sizeof(real), MPI_BYTE,
                  dd->pme_nodeid, eCommType_ChargeA, cr->mpi_comm_mysim,
                  &dd->req_pme[dd->nreq_pme++]);

    if (flags & PP_PME_CHARGEB)
        MPI_Isend(chargeB, n*sizeof(real), MPI_BYTE,
                  dd->pme_nodeid, eCommType_ChargeB, cr->mpi_comm_mysim,
                  &dd->req_pme[dd->nreq_pme++]);

    if (flags & PP_PME_SQRTC6)
        MPI_Isend(c6A, n*sizeof(real), MPI_BYTE,
                  dd->pme_nodeid, eCommType_SQRTC6A, cr->mpi_comm_mysim,
                  &dd->req_pme[dd->nreq_pme++]);

    if (flags & PP_PME_SQRTC6B)
        MPI_Isend(c6B, n*sizeof(real), MPI_BYTE,
                  dd->pme_nodeid, eCommType_SQRTC6B, cr->mpi_comm_mysim,
                  &dd->req_pme[dd->nreq_pme++]);

    if (flags & PP_PME_SIGMA)
        MPI_Isend(sigmaA, n*sizeof(real), MPI_BYTE,
                  dd->pme_nodeid, eCommType_SigmaA, cr->mpi_comm_mysim,
                  &dd->req_pme[dd->nreq_pme++]);

    if (flags & PP_PME_SIGMAB)
        MPI_Isend(sigmaB, n*sizeof(real), MPI_BYTE,
                  dd->pme_nodeid, eCommType_SigmaB, cr->mpi_comm_mysim,
                  &dd->req_pme[dd->nreq_pme++]);
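    /* Each per-atom parameter array goes out as its own non-blocking send,
     * tagged with a distinct eCommType value so the PME rank can match each
     * receive to the right buffer. */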
    if (flags & PP_PME_COORD)
        if (reinitGpuPmePpComms)
            fr->pmePpCommGpu->reinit(n);
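            /* reinit(n) sizes the GPU PME-PP communication object for the
             * current number of home atoms; presumably this is requested
             * whenever that count changes, e.g. after DD repartitioning. */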
        /* MPI_Isend does not accept a const buffer pointer */
        real *xRealPtr = const_cast<real *>(x[0]);
        if (useGpuPmePpComms && (fr != nullptr))
            void *sendPtr = sendCoordinatesFromGpu ? static_cast<void*>(fr->stateGpu->getCoordinates()) :
                                                     static_cast<void*>(xRealPtr);
            fr->pmePpCommGpu->sendCoordinatesToPmeCudaDirect(sendPtr, n, sendCoordinatesFromGpu, coordinatesReadyOnDeviceEvent);
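            /* With direct GPU communication the source buffer is either the
             * device coordinate buffer or the host buffer, depending on
             * sendCoordinatesFromGpu; coordinatesReadyOnDeviceEvent ensures
             * the coordinates are ready on the device before the transfer. */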
            MPI_Isend(xRealPtr, n*sizeof(rvec), MPI_BYTE,
                      dd->pme_nodeid, eCommType_COORD, cr->mpi_comm_mysim,
                      &dd->req_pme[dd->nreq_pme++]);
    if (!c_useDelayedWait)
        /* Wait for the non-blocking sends to complete. Strictly, this wait
         * could be skipped, since x and q are guaranteed not to be modified
         * before the next call to gmx_pme_send_x_q or gmx_pme_receive_f.
         */
        gmx_pme_send_coeffs_coords_wait(dd);
void gmx_pme_send_parameters(const t_commrec *cr,
                             const interaction_const_t *ic,
                             gmx_bool bFreeEnergy_q, gmx_bool bFreeEnergy_lj,
                             real *chargeA, real *chargeB,
                             real *sqrt_c6A, real *sqrt_c6B,
                             real *sigmaA, real *sigmaB,
                             int maxshift_x, int maxshift_y)
    unsigned int flags = 0;

    if (EEL_PME(ic->eeltype))
        flags |= PP_PME_CHARGE;

    if (EVDW_PME(ic->vdwtype))
        flags |= (PP_PME_SQRTC6 | PP_PME_SIGMA);

    if (bFreeEnergy_q || bFreeEnergy_lj)
        /* Assumes that the B state flags are in the bits just above
         * the ones for the A state. */
        flags |= (flags << 1);
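        /* For example, with the flag layout assumed above this turns
         * PP_PME_CHARGE into PP_PME_CHARGE | PP_PME_CHARGEB, and likewise
         * for the sqrt(C6) and sigma flags. */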
    gmx_pme_send_coeffs_coords(nullptr, cr, flags,
                               sqrt_c6A, sqrt_c6B, sigmaA, sigmaB,
                               nullptr, nullptr, 0, 0, maxshift_x, maxshift_y, -1, false, false, false, nullptr);
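    /* Note that the call above passes nullptr for the box and coordinates and
     * zero lambdas: it only updates the per-atom parameters, while the
     * coordinates are sent each step by gmx_pme_send_coordinates(). */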
void gmx_pme_send_coordinates(t_forcerec *fr, const t_commrec *cr, const matrix box, const rvec *x,
                              real lambda_q, real lambda_lj,
                              int64_t step, bool useGpuPmePpComms,
                              bool receiveCoordinateAddressFromPme,
                              bool sendCoordinatesFromGpu,
                              GpuEventSynchronizer *coordinatesReadyOnDeviceEvent, gmx_wallcycle *wcycle)
    wallcycle_start(wcycle, ewcPP_PMESENDX);

    unsigned int flags = PP_PME_COORD;
        flags |= PP_PME_ENER_VIR;
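        /* PP_PME_ENER_VIR additionally asks the PME rank to return energies
         * and the virial for this step; they are picked up on the PP side in
         * gmx_pme_receive_f(). */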
    gmx_pme_send_coeffs_coords(fr, cr, flags, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
                               box, x, lambda_q, lambda_lj, 0, 0, step, useGpuPmePpComms, receiveCoordinateAddressFromPme,
                               sendCoordinatesFromGpu, coordinatesReadyOnDeviceEvent);

    wallcycle_stop(wcycle, ewcPP_PMESENDX);
void gmx_pme_send_finish(const t_commrec *cr)
    unsigned int flags = PP_PME_FINISH;
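    /* PP_PME_FINISH tells the PME rank that the run is over, so that it can
     * stop waiting for further work from the PP ranks. */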
    gmx_pme_send_coeffs_coords(nullptr, cr, flags, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, 0, -1, false, false, false, nullptr);
void gmx_pme_send_switchgrid(const t_commrec *cr,
    gmx_pme_comm_n_box_t cnb;

    /* Only let one PP node signal each PME node */
    if (cr->dd->pme_receive_vir_ener)
        cnb.flags = PP_PME_SWITCHGRID;
        copy_ivec(grid_size, cnb.grid_size);
        cnb.ewaldcoeff_q  = ewaldcoeff_q;
        cnb.ewaldcoeff_lj = ewaldcoeff_lj;

        /* This message is rare, so we send it blocking to keep the code simple */
        MPI_Send(&cnb, sizeof(cnb), MPI_BYTE,
                 cr->dd->pme_nodeid, eCommType_CNB, cr->mpi_comm_mysim);
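        /* On receipt the PME rank switches to the new grid size and Ewald
         * coefficients; this path is taken only occasionally, e.g. when PME
         * tuning changes the grid during the run. */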
    GMX_UNUSED_VALUE(cr);
    GMX_UNUSED_VALUE(grid_size);
    GMX_UNUSED_VALUE(ewaldcoeff_q);
    GMX_UNUSED_VALUE(ewaldcoeff_lj);
void gmx_pme_send_resetcounters(const t_commrec gmx_unused *cr, int64_t gmx_unused step)
    gmx_pme_comm_n_box_t cnb;

    /* Only let one PP node signal each PME node */
    if (cr->dd->pme_receive_vir_ener)
        cnb.flags = PP_PME_RESETCOUNTERS;

        /* This message is rare, so we send it blocking to keep the code simple */
        MPI_Send(&cnb, sizeof(cnb), MPI_BYTE,
                 cr->dd->pme_nodeid, eCommType_CNB, cr->mpi_comm_mysim);
/*! \brief Receive virial and energy from PME rank */
static void receive_virial_energy(const t_commrec *cr,
                                  gmx::ForceWithVirial *forceWithVirial,
                                  real *energy_q, real *energy_lj,
                                  real *dvdlambda_q, real *dvdlambda_lj,
    gmx_pme_comm_vir_ene_t cve;

    if (cr->dd->pme_receive_vir_ener)
                    "PP rank %d receiving from PME rank %d: virial and energy\n",
                    cr->sim_nodeid, cr->dd->pme_nodeid);

        MPI_Recv(&cve, sizeof(cve), MPI_BYTE, cr->dd->pme_nodeid, 1, cr->mpi_comm_mysim,

        memset(&cve, 0, sizeof(cve));
        forceWithVirial->addVirialContribution(cve.vir_q);
        forceWithVirial->addVirialContribution(cve.vir_lj);
        *energy_q      = cve.energy_q;
        *energy_lj     = cve.energy_lj;
        *dvdlambda_q  += cve.dvdlambda_q;
        *dvdlambda_lj += cve.dvdlambda_lj;
        *pme_cycles    = cve.cycles;
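        /* The PME rank can also forward a stop condition (e.g. one raised in
         * response to a signal); propagating it here makes the PP ranks stop
         * the run consistently. */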
        if (cve.stop_cond != gmx_stop_cond_none)
            gmx_set_stop_condition(cve.stop_cond);
/*! \brief Receive force data from PME ranks */
static void recvFFromPme(gmx::PmePpCommGpu *pmePpCommGpu,
                         bool useGpuPmePpComms,
                         bool receivePmeForceToGpu)
    if (useGpuPmePpComms)
        GMX_ASSERT(pmePpCommGpu != nullptr, "Need valid pmePpCommGpu");
        // Receive directly using CUDA memory copy
        pmePpCommGpu->receiveForceFromPmeCudaDirect(recvptr, n, receivePmeForceToGpu);
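        /* Depending on receivePmeForceToGpu the forces end up either directly
         * in a GPU buffer or in the host buffer recvptr; in the GPU case the
         * CPU-side reduction in gmx_pme_receive_f() is skipped. */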
        // Receive data using MPI
        MPI_Recv(recvptr, n*sizeof(rvec), MPI_BYTE,
                 cr->dd->pme_nodeid, 0, cr->mpi_comm_mysim,
void gmx_pme_receive_f(gmx::PmePpCommGpu *pmePpCommGpu,
                       gmx::ForceWithVirial *forceWithVirial,
                       real *energy_q, real *energy_lj,
                       real *dvdlambda_q, real *dvdlambda_lj,
                       bool useGpuPmePpComms, bool receivePmeForceToGpu,
    if (c_useDelayedWait)
        /* Wait for the x request to finish */
        gmx_pme_send_coeffs_coords_wait(cr->dd);

    const int natoms = dd_numHomeAtoms(*cr->dd);
    std::vector<gmx::RVec> &buffer = cr->dd->pmeForceReceiveBuffer;
    buffer.resize(natoms);

    void *recvptr = reinterpret_cast<void*>(buffer.data());
    recvFFromPme(pmePpCommGpu, recvptr, natoms, cr, useGpuPmePpComms, receivePmeForceToGpu);

    int nt = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, natoms);

    gmx::ArrayRef<gmx::RVec> f = forceWithVirial->force_;
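    /* Unless the forces were already delivered to the GPU, the loops below
     * presumably accumulate the received PME forces from the buffer into the
     * local force array f, threaded with OpenMP when nt is larger than one. */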
    if (!receivePmeForceToGpu)
        /* Note that we would like to avoid this conditional by putting it
         * into the omp pragma instead, but then we still take the full
         * omp parallel for overhead (at least with gcc5).
         */
        for (int i = 0; i < natoms; i++)

#pragma omp parallel for num_threads(nt) schedule(static)
        for (int i = 0; i < natoms; i++)

    receive_virial_energy(cr, forceWithVirial, energy_q, energy_lj, dvdlambda_q, dvdlambda_lj, pme_cycles);