/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *
 * \brief This file contains function definitions necessary for
 * managing the offload of long-ranged PME work to separate MPI ranks,
 * for computing energies and forces (Coulomb and LJ).
 *
 * \author Berk Hess <hess@kth.se>
 * \ingroup module_ewald
 */
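
/* A sketch of the typical PP-side call sequence (an informal summary of
 * this file, not an authoritative protocol description):
 *
 *   gmx_pme_send_parameters()     - charges/LJ coefficients after (re)partitioning
 *   per step:
 *     gmx_pme_send_coordinates()  - box, coordinates, lambdas and flags
 *     ...                         - PP rank computes short-ranged forces
 *     gmx_pme_receive_f()         - forces, plus energies/virial if requested
 *   gmx_pme_send_finish()         - tells the PME rank to quit
 *
 * All sends funnel through gmx_pme_send_coeffs_coords(), with the PP_PME_*
 * flags selecting which payloads accompany the header message.
 */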

#include "gmxpre.h"

#include "config.h"

#include <cstdio>
#include <cstring>

#include "gromacs/domdec/domdec.h"
#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/ewald/pme.h"
#include "gromacs/ewald/pme_pp_comm_gpu.h"
#include "gromacs/gmxlib/network.h"
#include "gromacs/math/vec.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/mdtypes/forceoutput.h"
#include "gromacs/mdtypes/forcerec.h"
#include "gromacs/mdtypes/interaction_const.h"
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/mdtypes/state_propagator_data_gpu.h"
#include "gromacs/nbnxm/nbnxm.h"
#include "gromacs/timing/wallcycle.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxmpi.h"
#include "gromacs/utility/smalloc.h"

#include "pme_internal.h"
#include "pme_pp_communication.h"

/*! \brief Whether to delay waiting for communication to PME ranks to complete
 *
 * Delaying the wait (instead of blocking right after the sends) should be
 * faster with a real non-blocking MPI implementation.
 */
static constexpr bool c_useDelayedWait = false;
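
/* With delayed waits, each pending MPI_Isend request is stored in
 * dd->req_pme (counted by dd->nreq_pme) and is only waited upon at the
 * start of the next send, so the send can overlap with other work.
 */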

/*! \brief Wait for the pending data send requests to PME ranks to complete */
static void gmx_pme_send_coeffs_coords_wait(gmx_domdec_t* dd)
{
    if (dd->nreq_pme)
    {
#if GMX_MPI
        MPI_Waitall(dd->nreq_pme, dd->req_pme, MPI_STATUSES_IGNORE);
#endif
        dd->nreq_pme = 0;
    }
}

/*! \brief Send data to PME ranks */
static void gmx_pme_send_coeffs_coords(t_forcerec*      fr,
                                       const t_commrec* cr,
                                       unsigned int     flags,
                                       real gmx_unused* chargeA,
                                       real gmx_unused* chargeB,
                                       real gmx_unused* c6A,
                                       real gmx_unused* c6B,
                                       real gmx_unused* sigmaA,
                                       real gmx_unused* sigmaB,
                                       const matrix     box,
                                       const rvec gmx_unused* x,
                                       real                   lambda_q,
                                       real                   lambda_lj,
                                       int                    maxshift_x,
                                       int                    maxshift_y,
                                       int64_t                step,
                                       bool                   useGpuPmePpComms,
                                       bool                   reinitGpuPmePpComms,
                                       bool                   sendCoordinatesFromGpu,
                                       GpuEventSynchronizer*  coordinatesReadyOnDeviceEvent)
{
    gmx_domdec_t* dd = cr->dd;
    int           n  = dd_numHomeAtoms(*dd);

    if (debug)
    {
        fprintf(debug, "PP rank %d sending to PME rank %d: %d%s%s%s%s\n", cr->sim_nodeid, dd->pme_nodeid,
                n, (flags & PP_PME_CHARGE) ? " charges" : "", (flags & PP_PME_SQRTC6) ? " sqrtC6" : "",
                (flags & PP_PME_SIGMA) ? " sigma" : "", (flags & PP_PME_COORD) ? " coordinates" : "");
    }

    if (useGpuPmePpComms)
    {
        flags |= PP_PME_GPUCOMMS;
    }

    if (c_useDelayedWait)
    {
        /* We cannot reuse cnb until the pending communication has finished */
        gmx_pme_send_coeffs_coords_wait(dd);
    }

    if (dd->pme_receive_vir_ener)
    {
        /* Peer PP node: communicate all data */
        if (dd->cnb == nullptr)
        {
            snew(dd->cnb, 1);
        }
        gmx_pme_comm_n_box_t* cnb = dd->cnb;

        cnb->flags      = flags;
        cnb->natoms     = n;
        cnb->maxshift_x = maxshift_x;
        cnb->maxshift_y = maxshift_y;
        cnb->lambda_q   = lambda_q;
        cnb->lambda_lj  = lambda_lj;
        cnb->step       = step;
        if (flags & PP_PME_COORD)
        {
            copy_mat(box, cnb->box);
        }
#if GMX_MPI
        MPI_Isend(cnb, sizeof(*cnb), MPI_BYTE, dd->pme_nodeid, eCommType_CNB, cr->mpi_comm_mysim,
                  &dd->req_pme[dd->nreq_pme++]);
#endif
    }
    else if (flags & (PP_PME_CHARGE | PP_PME_SQRTC6 | PP_PME_SIGMA))
    {
#if GMX_MPI
        /* Communicate only the number of atoms */
        MPI_Isend(&n, sizeof(n), MPI_BYTE, dd->pme_nodeid, eCommType_CNB, cr->mpi_comm_mysim,
                  &dd->req_pme[dd->nreq_pme++]);
#endif
    }

#if GMX_MPI
    if (n > 0)
    {
        if (flags & PP_PME_CHARGE)
        {
            MPI_Isend(chargeA, n * sizeof(real), MPI_BYTE, dd->pme_nodeid, eCommType_ChargeA,
                      cr->mpi_comm_mysim, &dd->req_pme[dd->nreq_pme++]);
        }
        if (flags & PP_PME_CHARGEB)
        {
            MPI_Isend(chargeB, n * sizeof(real), MPI_BYTE, dd->pme_nodeid, eCommType_ChargeB,
                      cr->mpi_comm_mysim, &dd->req_pme[dd->nreq_pme++]);
        }
        if (flags & PP_PME_SQRTC6)
        {
            MPI_Isend(c6A, n * sizeof(real), MPI_BYTE, dd->pme_nodeid, eCommType_SQRTC6A,
                      cr->mpi_comm_mysim, &dd->req_pme[dd->nreq_pme++]);
        }
        if (flags & PP_PME_SQRTC6B)
        {
            MPI_Isend(c6B, n * sizeof(real), MPI_BYTE, dd->pme_nodeid, eCommType_SQRTC6B,
                      cr->mpi_comm_mysim, &dd->req_pme[dd->nreq_pme++]);
        }
        if (flags & PP_PME_SIGMA)
        {
            MPI_Isend(sigmaA, n * sizeof(real), MPI_BYTE, dd->pme_nodeid, eCommType_SigmaA,
                      cr->mpi_comm_mysim, &dd->req_pme[dd->nreq_pme++]);
        }
        if (flags & PP_PME_SIGMAB)
        {
            MPI_Isend(sigmaB, n * sizeof(real), MPI_BYTE, dd->pme_nodeid, eCommType_SigmaB,
                      cr->mpi_comm_mysim, &dd->req_pme[dd->nreq_pme++]);
        }
        if (flags & PP_PME_COORD)
        {
            if (reinitGpuPmePpComms)
            {
                fr->pmePpCommGpu->reinit(n);
            }

            /* MPI_Isend does not accept a const buffer pointer */
            real* xRealPtr = const_cast<real*>(x[0]);
            if (useGpuPmePpComms && (fr != nullptr))
            {
                void* sendPtr = sendCoordinatesFromGpu
                                        ? static_cast<void*>(fr->stateGpu->getCoordinates())
                                        : static_cast<void*>(xRealPtr);
                fr->pmePpCommGpu->sendCoordinatesToPmeCudaDirect(sendPtr, n, sendCoordinatesFromGpu,
                                                                 coordinatesReadyOnDeviceEvent);
            }
            else
            {
                MPI_Isend(xRealPtr, n * sizeof(rvec), MPI_BYTE, dd->pme_nodeid, eCommType_COORD,
                          cr->mpi_comm_mysim, &dd->req_pme[dd->nreq_pme++]);
            }
        }
    }
#else
    GMX_UNUSED_VALUE(fr);
    GMX_UNUSED_VALUE(reinitGpuPmePpComms);
    GMX_UNUSED_VALUE(sendCoordinatesFromGpu);
    GMX_UNUSED_VALUE(coordinatesReadyOnDeviceEvent);
#endif
    if (!c_useDelayedWait)
    {
        /* Wait for the sends to complete. With c_useDelayedWait this wait is
         * skipped, which is safe because x and q will not be modified before
         * the next call to gmx_pme_send_x_q or gmx_pme_receive_f.
         */
        gmx_pme_send_coeffs_coords_wait(dd);
    }
}
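
/* Note: each payload sent above travels as its own MPI message, identified
 * by an eCommType_* tag that the receiving PME rank uses to match the data
 * to the right buffer (the shared flags, tags and message structs are
 * declared in pme_pp_communication.h, included above).
 */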

void gmx_pme_send_parameters(const t_commrec*           cr,
                             const interaction_const_t* ic,
                             gmx_bool                   bFreeEnergy_q,
                             gmx_bool                   bFreeEnergy_lj,
                             real*                      chargeA,
                             real*                      chargeB,
                             real*                      sqrt_c6A,
                             real*                      sqrt_c6B,
                             real*                      sigmaA,
                             real*                      sigmaB,
                             int                        maxshift_x,
                             int                        maxshift_y)
{
    unsigned int flags = 0;

    if (EEL_PME(ic->eeltype))
    {
        flags |= PP_PME_CHARGE;
    }
    if (EVDW_PME(ic->vdwtype))
    {
        flags |= (PP_PME_SQRTC6 | PP_PME_SIGMA);
    }
    if (bFreeEnergy_q || bFreeEnergy_lj)
    {
        /* Assumes that the B state flags are in the bits just above
         * the ones for the A state. */
        flags |= (flags << 1);
    }
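    /* For example, with the bit layout assumed above,
     * PP_PME_CHARGEB == (PP_PME_CHARGE << 1), so the single left shift
     * turns each A-state flag into its B-state counterpart.
     */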

    gmx_pme_send_coeffs_coords(nullptr, cr, flags, chargeA, chargeB, sqrt_c6A, sqrt_c6B, sigmaA,
                               sigmaB, nullptr, nullptr, 0, 0, maxshift_x, maxshift_y, -1, false,
                               false, false, nullptr);
}

void gmx_pme_send_coordinates(t_forcerec*           fr,
                              const t_commrec*      cr,
                              const matrix          box,
                              const rvec*           x,
                              real                  lambda_q,
                              real                  lambda_lj,
                              gmx_bool              bEnerVir,
                              int64_t               step,
                              bool                  useGpuPmePpComms,
                              bool                  receiveCoordinateAddressFromPme,
                              bool                  sendCoordinatesFromGpu,
                              GpuEventSynchronizer* coordinatesReadyOnDeviceEvent,
                              gmx_wallcycle*        wcycle)
{
    wallcycle_start(wcycle, ewcPP_PMESENDX);

    unsigned int flags = PP_PME_COORD;
    if (bEnerVir)
    {
        flags |= PP_PME_ENER_VIR;
    }
    gmx_pme_send_coeffs_coords(fr, cr, flags, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
                               box, x, lambda_q, lambda_lj, 0, 0, step, useGpuPmePpComms,
                               receiveCoordinateAddressFromPme, sendCoordinatesFromGpu,
                               coordinatesReadyOnDeviceEvent);

    wallcycle_stop(wcycle, ewcPP_PMESENDX);
}

void gmx_pme_send_finish(const t_commrec* cr)
{
    unsigned int flags = PP_PME_FINISH;

    gmx_pme_send_coeffs_coords(nullptr, cr, flags, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
                               nullptr, nullptr, 0, 0, 0, 0, -1, false, false, false, nullptr);
}

void gmx_pme_send_switchgrid(const t_commrec* cr, ivec grid_size, real ewaldcoeff_q, real ewaldcoeff_lj)
{
#if GMX_MPI
    gmx_pme_comm_n_box_t cnb;

    /* Only let one PP node signal each PME node */
    if (cr->dd->pme_receive_vir_ener)
    {
        cnb.flags = PP_PME_SWITCHGRID;
        copy_ivec(grid_size, cnb.grid_size);
        cnb.ewaldcoeff_q  = ewaldcoeff_q;
        cnb.ewaldcoeff_lj = ewaldcoeff_lj;

        /* This message is uncommon, so we send it blocking to keep the code simple */
        MPI_Send(&cnb, sizeof(cnb), MPI_BYTE, cr->dd->pme_nodeid, eCommType_CNB, cr->mpi_comm_mysim);
    }
#else
    GMX_UNUSED_VALUE(cr);
    GMX_UNUSED_VALUE(grid_size);
    GMX_UNUSED_VALUE(ewaldcoeff_q);
    GMX_UNUSED_VALUE(ewaldcoeff_lj);
#endif
}

void gmx_pme_send_resetcounters(const t_commrec gmx_unused* cr, int64_t gmx_unused step)
{
#if GMX_MPI
    gmx_pme_comm_n_box_t cnb;

    /* Only let one PP node signal each PME node */
    if (cr->dd->pme_receive_vir_ener)
    {
        cnb.flags = PP_PME_RESETCOUNTERS;
        cnb.step  = step;

        /* This message is uncommon, so we send it blocking to keep the code simple */
        MPI_Send(&cnb, sizeof(cnb), MPI_BYTE, cr->dd->pme_nodeid, eCommType_CNB, cr->mpi_comm_mysim);
    }
#endif
}

/*! \brief Receive virial and energy from PME rank */
static void receive_virial_energy(const t_commrec*      cr,
                                  gmx::ForceWithVirial* forceWithVirial,
                                  real*                 energy_q,
                                  real*                 energy_lj,
                                  real*                 dvdlambda_q,
                                  real*                 dvdlambda_lj,
                                  float*                pme_cycles)
{
    gmx_pme_comm_vir_ene_t cve;

    if (cr->dd->pme_receive_vir_ener)
    {
        if (debug)
        {
            fprintf(debug, "PP rank %d receiving from PME rank %d: virial and energy\n",
                    cr->sim_nodeid, cr->dd->pme_nodeid);
        }
#if GMX_MPI
        MPI_Recv(&cve, sizeof(cve), MPI_BYTE, cr->dd->pme_nodeid, 1, cr->mpi_comm_mysim, MPI_STATUS_IGNORE);
#else
        memset(&cve, 0, sizeof(cve));
#endif

        forceWithVirial->addVirialContribution(cve.vir_q);
        forceWithVirial->addVirialContribution(cve.vir_lj);
        *energy_q  = cve.energy_q;
        *energy_lj = cve.energy_lj;
        *dvdlambda_q += cve.dvdlambda_q;
        *dvdlambda_lj += cve.dvdlambda_lj;
        *pme_cycles = cve.cycles;

        if (cve.stop_cond != gmx_stop_cond_none)
        {
            gmx_set_stop_condition(cve.stop_cond);
        }
    }
    else
    {
        *energy_q   = 0;
        *energy_lj  = 0;
        *pme_cycles = 0;
    }
}

/*! \brief Receive force data from PME ranks */
static void recvFFromPme(gmx::PmePpCommGpu* pmePpCommGpu,
                         void*              recvptr,
                         int                n,
                         const t_commrec*   cr,
                         bool               useGpuPmePpComms,
                         bool               receivePmeForceToGpu)
{
    if (useGpuPmePpComms)
    {
        GMX_ASSERT(pmePpCommGpu != nullptr, "Need valid pmePpCommGpu");
        // Receive directly using CUDA memory copy
        pmePpCommGpu->receiveForceFromPmeCudaDirect(recvptr, n, receivePmeForceToGpu);
    }
    else
    {
        // Receive data using MPI
#if GMX_MPI
        MPI_Recv(recvptr, n * sizeof(rvec), MPI_BYTE, cr->dd->pme_nodeid, 0, cr->mpi_comm_mysim,
                 MPI_STATUS_IGNORE);
#else
        GMX_UNUSED_VALUE(cr);
#endif
    }
}
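
/* Note: when receivePmeForceToGpu is set, receiveForceFromPmeCudaDirect()
 * keeps the forces in GPU memory rather than filling the host-side recvptr
 * buffer, which is why gmx_pme_receive_f() below skips the host-side force
 * reduction in that case.
 */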

void gmx_pme_receive_f(gmx::PmePpCommGpu*    pmePpCommGpu,
                       const t_commrec*      cr,
                       gmx::ForceWithVirial* forceWithVirial,
                       real*                 energy_q,
                       real*                 energy_lj,
                       real*                 dvdlambda_q,
                       real*                 dvdlambda_lj,
                       bool                  useGpuPmePpComms,
                       bool                  receivePmeForceToGpu,
                       float*                pme_cycles)
{
    if (c_useDelayedWait)
    {
        /* Wait for the x request to finish */
        gmx_pme_send_coeffs_coords_wait(cr->dd);
    }

    const int               natoms = dd_numHomeAtoms(*cr->dd);
    std::vector<gmx::RVec>& buffer = cr->dd->pmeForceReceiveBuffer;
    buffer.resize(natoms);

    void* recvptr = reinterpret_cast<void*>(buffer.data());
    recvFFromPme(pmePpCommGpu, recvptr, natoms, cr, useGpuPmePpComms, receivePmeForceToGpu);

    int nt = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, natoms);

    gmx::ArrayRef<gmx::RVec> f = forceWithVirial->force_;

    if (!receivePmeForceToGpu)
    {
        /* Note that we would like to avoid this conditional by putting it
         * into the omp pragma instead, but then we still take the full
         * omp parallel for overhead (at least with gcc5).
         */
        if (nt == 1)
        {
            for (int i = 0; i < natoms; i++)
            {
                f[i] += buffer[i];
            }
        }
        else
        {
#pragma omp parallel for num_threads(nt) schedule(static)
            for (int i = 0; i < natoms; i++)
            {
                f[i] += buffer[i];
            }
        }
    }

    receive_virial_energy(cr, forceWithVirial, energy_q, energy_lj, dvdlambda_q, dvdlambda_lj, pme_cycles);
}