/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
 * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *
 * \brief This file contains function definitions necessary for
 * managing the offload of long-ranged PME work to a separate MPI rank,
 * for computing energies and forces (Coulomb and LJ).
 *
 * \author Berk Hess <hess@kth.se>
 * \ingroup module_ewald
 */

#include "gmxpre.h"

#include "config.h"

#include <cstdio>
#include <cstring>

#include "gromacs/domdec/domdec.h"
#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/ewald/pme.h"
#include "gromacs/ewald/pme_pp_comm_gpu.h"
#include "gromacs/gmxlib/network.h"
#include "gromacs/math/vec.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/mdtypes/forceoutput.h"
#include "gromacs/mdtypes/forcerec.h"
#include "gromacs/mdtypes/interaction_const.h"
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/mdtypes/state_propagator_data_gpu.h"
#include "gromacs/nbnxm/nbnxm.h"
#include "gromacs/timing/wallcycle.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxmpi.h"
#include "gromacs/utility/smalloc.h"

#include "pme_internal.h"
#include "pme_pp_communication.h"
/*! \brief Whether to delay blocking for sends to PME ranks to complete
 *
 * When false, we block right after posting the sends. Delaying the wait
 * should be faster with a truly non-blocking MPI implementation.
 */
static constexpr bool c_useDelayedWait = false;
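
/* In short (see the code below): with c_useDelayedWait == false we post
 * the MPI_Isend()s and immediately MPI_Waitall() on dd->req_pme in the
 * same call; with c_useDelayedWait == true we return after posting, and
 * the wait happens at the start of the next send, or of gmx_pme_receive_f(),
 * before the buffers are reused.
 */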

/*! \brief Wait for the pending data send requests to PME ranks to complete */
static void gmx_pme_send_coeffs_coords_wait(gmx_domdec_t* dd)
{
    if (dd->nreq_pme)
    {
#if GMX_MPI
        MPI_Waitall(dd->nreq_pme, dd->req_pme, MPI_STATUSES_IGNORE);
#endif
        dd->nreq_pme = 0;
    }
}

/*! \brief Send data to PME ranks */
static void gmx_pme_send_coeffs_coords(t_forcerec*      fr,
                                       const t_commrec* cr,
                                       unsigned int     flags,
                                       real gmx_unused* chargeA,
                                       real gmx_unused* chargeB,
                                       real gmx_unused* c6A,
                                       real gmx_unused* c6B,
                                       real gmx_unused* sigmaA,
                                       real gmx_unused* sigmaB,
                                       const matrix     box,
                                       const rvec gmx_unused* x,
                                       real                   lambda_q,
                                       real                   lambda_lj,
                                       int                    maxshift_x,
                                       int                    maxshift_y,
                                       int64_t                step,
                                       bool                   useGpuPmePpComms,
                                       bool                   reinitGpuPmePpComms,
                                       bool                   sendCoordinatesFromGpu,
                                       GpuEventSynchronizer*  coordinatesReadyOnDeviceEvent)
{
    gmx_domdec_t*         dd;
    gmx_pme_comm_n_box_t* cnb;
    int                   n;

    dd = cr->dd;
    n  = dd_numHomeAtoms(*dd);

    if (debug)
    {
        fprintf(debug, "PP rank %d sending to PME rank %d: %d%s%s%s%s\n", cr->sim_nodeid, dd->pme_nodeid,
                n, (flags & PP_PME_CHARGE) ? " charges" : "", (flags & PP_PME_SQRTC6) ? " sqrtC6" : "",
                (flags & PP_PME_SIGMA) ? " sigma" : "", (flags & PP_PME_COORD) ? " coordinates" : "");
    }

    if (useGpuPmePpComms)
    {
        flags |= PP_PME_GPUCOMMS;
    }

    if (c_useDelayedWait)
    {
        /* We cannot use cnb until the pending communication has finished */
        gmx_pme_send_coeffs_coords_wait(dd);
    }

    if (dd->pme_receive_vir_ener)
    {
        /* Peer PP node: communicate all data */
        if (dd->cnb == nullptr)
        {
            snew(dd->cnb, 1);
        }
        cnb = dd->cnb;

        cnb->flags      = flags;
        cnb->natoms     = n;
        cnb->maxshift_x = maxshift_x;
        cnb->maxshift_y = maxshift_y;
        cnb->lambda_q   = lambda_q;
        cnb->lambda_lj  = lambda_lj;
        cnb->step       = step;
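        /* The box is only used together with the coordinates, so it is
         * sent only when coordinates are sent. */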
        if (flags & PP_PME_COORD)
        {
            copy_mat(box, cnb->box);
        }
#if GMX_MPI
        MPI_Isend(cnb, sizeof(*cnb), MPI_BYTE, dd->pme_nodeid, eCommType_CNB, cr->mpi_comm_mysim,
                  &dd->req_pme[dd->nreq_pme++]);
#endif
    }
    else if (flags & (PP_PME_CHARGE | PP_PME_SQRTC6 | PP_PME_SIGMA))
    {
#if GMX_MPI
        /* Communicate only the number of atoms */
        MPI_Isend(&n, sizeof(n), MPI_BYTE, dd->pme_nodeid, eCommType_CNB, cr->mpi_comm_mysim,
                  &dd->req_pme[dd->nreq_pme++]);
#endif
    }

#if GMX_MPI
    if (n > 0)
    {
        if (flags & PP_PME_CHARGE)
        {
            MPI_Isend(chargeA, n * sizeof(real), MPI_BYTE, dd->pme_nodeid, eCommType_ChargeA,
                      cr->mpi_comm_mysim, &dd->req_pme[dd->nreq_pme++]);
        }
        if (flags & PP_PME_CHARGEB)
        {
            MPI_Isend(chargeB, n * sizeof(real), MPI_BYTE, dd->pme_nodeid, eCommType_ChargeB,
                      cr->mpi_comm_mysim, &dd->req_pme[dd->nreq_pme++]);
        }
        if (flags & PP_PME_SQRTC6)
        {
            MPI_Isend(c6A, n * sizeof(real), MPI_BYTE, dd->pme_nodeid, eCommType_SQRTC6A,
                      cr->mpi_comm_mysim, &dd->req_pme[dd->nreq_pme++]);
        }
        if (flags & PP_PME_SQRTC6B)
        {
            MPI_Isend(c6B, n * sizeof(real), MPI_BYTE, dd->pme_nodeid, eCommType_SQRTC6B,
                      cr->mpi_comm_mysim, &dd->req_pme[dd->nreq_pme++]);
        }
        if (flags & PP_PME_SIGMA)
        {
            MPI_Isend(sigmaA, n * sizeof(real), MPI_BYTE, dd->pme_nodeid, eCommType_SigmaA,
                      cr->mpi_comm_mysim, &dd->req_pme[dd->nreq_pme++]);
        }
        if (flags & PP_PME_SIGMAB)
        {
            MPI_Isend(sigmaB, n * sizeof(real), MPI_BYTE, dd->pme_nodeid, eCommType_SigmaB,
                      cr->mpi_comm_mysim, &dd->req_pme[dd->nreq_pme++]);
        }
        if (flags & PP_PME_COORD)
        {
            if (reinitGpuPmePpComms)
            {
                fr->pmePpCommGpu->reinit(n);
            }

            /* MPI_Isend does not accept a const buffer pointer */
            real* xRealPtr = const_cast<real*>(x[0]);
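            /* With direct GPU communication the coordinates are handed to
             * the PME rank through pmePpCommGpu, either from GPU memory or
             * from this host buffer; otherwise plain MPI is used. */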
            if (useGpuPmePpComms && (fr != nullptr))
            {
                void* sendPtr = sendCoordinatesFromGpu
                                        ? static_cast<void*>(fr->stateGpu->getCoordinates())
                                        : static_cast<void*>(xRealPtr);
                fr->pmePpCommGpu->sendCoordinatesToPmeCudaDirect(sendPtr, n, sendCoordinatesFromGpu,
                                                                 coordinatesReadyOnDeviceEvent);
            }
            else
            {
                MPI_Isend(xRealPtr, n * sizeof(rvec), MPI_BYTE, dd->pme_nodeid, eCommType_COORD,
                          cr->mpi_comm_mysim, &dd->req_pme[dd->nreq_pme++]);
            }
        }
    }
#else
    GMX_UNUSED_VALUE(fr);
    GMX_UNUSED_VALUE(reinitGpuPmePpComms);
    GMX_UNUSED_VALUE(sendCoordinatesFromGpu);
    GMX_UNUSED_VALUE(coordinatesReadyOnDeviceEvent);
#endif
    if (!c_useDelayedWait)
    {
        /* Wait for the send requests to complete. Strictly, this wait
         * could be skipped, since x and q will not be modified before
         * the next call to gmx_pme_send_x_q or gmx_pme_receive_f.
         */
        gmx_pme_send_coeffs_coords_wait(dd);
    }
}

void gmx_pme_send_parameters(const t_commrec*           cr,
                             const interaction_const_t* ic,
                             gmx_bool                   bFreeEnergy_q,
                             gmx_bool                   bFreeEnergy_lj,
                             real*                      chargeA,
                             real*                      chargeB,
                             real*                      sqrt_c6A,
                             real*                      sqrt_c6B,
                             real*                      sigmaA,
                             real*                      sigmaB,
                             int                        maxshift_x,
                             int                        maxshift_y)
{
    unsigned int flags = 0;

    if (EEL_PME(ic->eeltype))
    {
        flags |= PP_PME_CHARGE;
    }
    if (EVDW_PME(ic->vdwtype))
    {
        flags |= (PP_PME_SQRTC6 | PP_PME_SIGMA);
    }
    if (bFreeEnergy_q || bFreeEnergy_lj)
    {
        /* Assumes that the B-state flags are in the bits just above
         * the ones for the A state, so that shifting left by one turns
         * e.g. PP_PME_CHARGE into PP_PME_CHARGEB. */
        flags |= (flags << 1);
    }

    gmx_pme_send_coeffs_coords(nullptr, cr, flags, chargeA, chargeB, sqrt_c6A, sqrt_c6B, sigmaA,
                               sigmaB, nullptr, nullptr, 0, 0, maxshift_x, maxshift_y, -1, false,
                               false, false, nullptr);
}

void gmx_pme_send_coordinates(t_forcerec*           fr,
                              const t_commrec*      cr,
                              const matrix          box,
                              const rvec*           x,
                              real                  lambda_q,
                              real                  lambda_lj,
                              gmx_bool              bEnerVir,
                              int64_t               step,
                              bool                  useGpuPmePpComms,
                              bool                  receiveCoordinateAddressFromPme,
                              bool                  sendCoordinatesFromGpu,
                              GpuEventSynchronizer* coordinatesReadyOnDeviceEvent,
                              gmx_wallcycle*        wcycle)
{
    wallcycle_start(wcycle, ewcPP_PMESENDX);

    unsigned int flags = PP_PME_COORD;
    if (bEnerVir)
    {
        flags |= PP_PME_ENER_VIR;
    }
    gmx_pme_send_coeffs_coords(fr, cr, flags, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
                               box, x, lambda_q, lambda_lj, 0, 0, step, useGpuPmePpComms,
                               receiveCoordinateAddressFromPme, sendCoordinatesFromGpu,
                               coordinatesReadyOnDeviceEvent);

    wallcycle_stop(wcycle, ewcPP_PMESENDX);
}

void gmx_pme_send_finish(const t_commrec* cr)
{
    unsigned int flags = PP_PME_FINISH;

    gmx_pme_send_coeffs_coords(nullptr, cr, flags, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
                               nullptr, nullptr, 0, 0, 0, 0, -1, false, false, false, nullptr);
}

void gmx_pme_send_switchgrid(const t_commrec* cr, ivec grid_size, real ewaldcoeff_q, real ewaldcoeff_lj)
{
#if GMX_MPI
    gmx_pme_comm_n_box_t cnb;

    /* Only let one PP node signal each PME node */
    if (cr->dd->pme_receive_vir_ener)
    {
        cnb.flags = PP_PME_SWITCHGRID;
        copy_ivec(grid_size, cnb.grid_size);
        cnb.ewaldcoeff_q  = ewaldcoeff_q;
        cnb.ewaldcoeff_lj = ewaldcoeff_lj;

        /* This message is uncommon, so we send it blocking to keep the code simple */
        MPI_Send(&cnb, sizeof(cnb), MPI_BYTE, cr->dd->pme_nodeid, eCommType_CNB, cr->mpi_comm_mysim);
    }
#else
    GMX_UNUSED_VALUE(cr);
    GMX_UNUSED_VALUE(grid_size);
    GMX_UNUSED_VALUE(ewaldcoeff_q);
    GMX_UNUSED_VALUE(ewaldcoeff_lj);
#endif
}

void gmx_pme_send_resetcounters(const t_commrec gmx_unused* cr, int64_t gmx_unused step)
{
#if GMX_MPI
    gmx_pme_comm_n_box_t cnb;

    /* Only let one PP node signal each PME node */
    if (cr->dd->pme_receive_vir_ener)
    {
        cnb.flags = PP_PME_RESETCOUNTERS;
        cnb.step  = step;

        /* This message is uncommon, so we send it blocking to keep the code simple */
        MPI_Send(&cnb, sizeof(cnb), MPI_BYTE, cr->dd->pme_nodeid, eCommType_CNB, cr->mpi_comm_mysim);
    }
#endif
}

/*! \brief Receive virial and energy from PME rank */
static void receive_virial_energy(const t_commrec*      cr,
                                  gmx::ForceWithVirial* forceWithVirial,
                                  real*                 energy_q,
                                  real*                 energy_lj,
                                  real*                 dvdlambda_q,
                                  real*                 dvdlambda_lj,
                                  float*                pme_cycles)
{
    gmx_pme_comm_vir_ene_t cve;

    if (cr->dd->pme_receive_vir_ener)
    {
        if (debug)
        {
            fprintf(debug, "PP rank %d receiving from PME rank %d: virial and energy\n",
                    cr->sim_nodeid, cr->dd->pme_nodeid);
        }
#if GMX_MPI
        MPI_Recv(&cve, sizeof(cve), MPI_BYTE, cr->dd->pme_nodeid, 1, cr->mpi_comm_mysim, MPI_STATUS_IGNORE);
#else
        memset(&cve, 0, sizeof(cve));
#endif

        forceWithVirial->addVirialContribution(cve.vir_q);
        forceWithVirial->addVirialContribution(cve.vir_lj);
        *energy_q  = cve.energy_q;
        *energy_lj = cve.energy_lj;
        *dvdlambda_q += cve.dvdlambda_q;
        *dvdlambda_lj += cve.dvdlambda_lj;
        *pme_cycles = cve.cycles;

        if (cve.stop_cond != gmx_stop_cond_none)
        {
            gmx_set_stop_condition(cve.stop_cond);
        }
    }
    else
    {
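        /* This PP rank is not the one designated to receive virial and
         * energy, so only clear the assigned outputs; the dvdlambda
         * values are only ever accumulated into and are left untouched. */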
        *energy_q   = 0;
        *energy_lj  = 0;
        *pme_cycles = 0;
    }
}

/*! \brief Receive force data from PME ranks */
static void recvFFromPme(gmx::PmePpCommGpu* pmePpCommGpu,
                         void*              recvptr,
                         int                n,
                         const t_commrec*   cr,
                         bool               useGpuPmePpComms,
                         bool               receivePmeForceToGpu)
{
    if (useGpuPmePpComms)
    {
        GMX_ASSERT(pmePpCommGpu != nullptr, "Need valid pmePpCommGpu");
        // Receive directly using CUDA memory copy
        pmePpCommGpu->receiveForceFromPmeCudaDirect(recvptr, n, receivePmeForceToGpu);
    }
    else
    {
        // Receive data using MPI
#if GMX_MPI
        MPI_Recv(recvptr, n * sizeof(rvec), MPI_BYTE, cr->dd->pme_nodeid, 0, cr->mpi_comm_mysim,
                 MPI_STATUS_IGNORE);
#else
        GMX_UNUSED_VALUE(cr);
#endif
    }
}

void gmx_pme_receive_f(gmx::PmePpCommGpu*    pmePpCommGpu,
                       const t_commrec*      cr,
                       gmx::ForceWithVirial* forceWithVirial,
                       real*                 energy_q,
                       real*                 energy_lj,
                       real*                 dvdlambda_q,
                       real*                 dvdlambda_lj,
                       bool                  useGpuPmePpComms,
                       bool                  receivePmeForceToGpu,
                       float*                pme_cycles)
{
    if (c_useDelayedWait)
    {
        /* Wait for the x request to finish */
        gmx_pme_send_coeffs_coords_wait(cr->dd);
    }

    const int               natoms = dd_numHomeAtoms(*cr->dd);
    std::vector<gmx::RVec>& buffer = cr->dd->pmeForceReceiveBuffer;
    buffer.resize(natoms);

    void* recvptr = reinterpret_cast<void*>(buffer.data());
    recvFFromPme(pmePpCommGpu, recvptr, natoms, cr, useGpuPmePpComms, receivePmeForceToGpu);

    int nt = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, natoms);

    gmx::ArrayRef<gmx::RVec> f = forceWithVirial->force_;

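    /* When receivePmeForceToGpu is set, the forces were delivered straight
     * to GPU memory above, so the host-side reduction below is skipped. */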
    if (!receivePmeForceToGpu)
    {
        /* Note that we would like to avoid this conditional by putting it
         * into the omp pragma instead, but then we still take the full
         * omp parallel for overhead (at least with gcc5).
         */
        if (nt == 1)
        {
            for (int i = 0; i < natoms; i++)
            {
                f[i] += buffer[i];
            }
        }
        else
        {
#pragma omp parallel for num_threads(nt) schedule(static)
            for (int i = 0; i < natoms; i++)
            {
                f[i] += buffer[i];
            }
        }
    }

    receive_virial_energy(cr, forceWithVirial, energy_q, energy_lj, dvdlambda_q, dvdlambda_lj, pme_cycles);
}