[alexxy/gromacs.git] / src / gromacs / ewald / pme_pp.cpp
/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
 * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *
 * \brief This file contains function definitions necessary for
 * managing the offload of long-ranged PME work to separate MPI ranks,
 * for computing energies and forces (Coulomb and LJ).
 *
 * \author Berk Hess <hess@kth.se>
 * \ingroup module_ewald
 */

#include "gmxpre.h"

#include "pme_pp.h"

#include "config.h"

#include <cstdio>
#include <cstring>

#include "gromacs/domdec/domdec.h"
#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/ewald/pme.h"
#include "gromacs/ewald/pme_pp_comm_gpu.h"
#include "gromacs/gmxlib/network.h"
#include "gromacs/math/vec.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/mdtypes/forceoutput.h"
#include "gromacs/mdtypes/forcerec.h"
#include "gromacs/mdtypes/interaction_const.h"
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/mdtypes/state_propagator_data_gpu.h"
#include "gromacs/nbnxm/nbnxm.h"
#include "gromacs/timing/wallcycle.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxmpi.h"
#include "gromacs/utility/smalloc.h"

#include "pme_pp_communication.h"

/*! \brief Whether to delay waiting for the sends to PME ranks until the data is needed again
 *
 * Delayed waiting should be faster with a truly non-blocking MPI implementation.
 */
static constexpr bool c_useDelayedWait = false;
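/* With c_useDelayedWait == false, gmx_pme_send_coeffs_coords() waits for its
 * MPI_Isend requests to complete before returning. With c_useDelayedWait == true,
 * that wait is instead performed at the start of the next
 * gmx_pme_send_coeffs_coords() or gmx_pme_receive_f() call, i.e. just before the
 * request array dd->req_pme and the send buffers would be reused.
 */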

/*! \brief Wait for the pending data send requests to PME ranks to complete */
static void gmx_pme_send_coeffs_coords_wait(gmx_domdec_t* dd)
{
    if (dd->nreq_pme)
    {
#if GMX_MPI
        MPI_Waitall(dd->nreq_pme, dd->req_pme, MPI_STATUSES_IGNORE);
#endif
        dd->nreq_pme = 0;
    }
}

/*! \brief Send data to PME ranks */
static void gmx_pme_send_coeffs_coords(t_forcerec*                    fr,
                                       const t_commrec*               cr,
                                       unsigned int                   flags,
                                       gmx::ArrayRef<const real>      chargeA,
                                       gmx::ArrayRef<const real>      chargeB,
                                       gmx::ArrayRef<const real>      c6A,
                                       gmx::ArrayRef<const real>      c6B,
                                       gmx::ArrayRef<const real>      sigmaA,
                                       gmx::ArrayRef<const real>      sigmaB,
                                       const matrix                   box,
                                       gmx::ArrayRef<const gmx::RVec> x,
                                       real                           lambda_q,
                                       real                           lambda_lj,
                                       int                            maxshift_x,
                                       int                            maxshift_y,
                                       int64_t                        step,
                                       bool                           useGpuPmePpComms,
                                       bool                           reinitGpuPmePpComms,
                                       bool                           sendCoordinatesFromGpu,
                                       GpuEventSynchronizer*          coordinatesReadyOnDeviceEvent)
{
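    /* Communication protocol, as implemented below: when this PP rank is the one
     * that also receives virial and energy (dd->pme_receive_vir_ener), a full
     * gmx_pme_comm_n_box_t header is sent with tag eCommType_CNB; otherwise only
     * the atom count is sent, and only when parameter arrays are included. The
     * per-atom arrays selected by the PP_PME_* flags then follow as separate
     * messages, each with its own eCommType_* tag; coordinates go either over MPI
     * or through the GPU PME-PP communication object.
     */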
    gmx_domdec_t*         dd;
    gmx_pme_comm_n_box_t* cnb;
    int                   n;

    dd = cr->dd;
    n  = dd_numHomeAtoms(*dd);

    if (debug)
    {
        fprintf(debug,
                "PP rank %d sending to PME rank %d: %d%s%s%s%s\n",
                cr->sim_nodeid,
                dd->pme_nodeid,
                n,
                (flags & PP_PME_CHARGE) ? " charges" : "",
                (flags & PP_PME_SQRTC6) ? " sqrtC6" : "",
                (flags & PP_PME_SIGMA) ? " sigma" : "",
                (flags & PP_PME_COORD) ? " coordinates" : "");
    }

    if (useGpuPmePpComms)
    {
        flags |= PP_PME_GPUCOMMS;
    }

    if (c_useDelayedWait)
    {
        /* We cannot reuse cnb until the pending communication has finished */
        gmx_pme_send_coeffs_coords_wait(dd);
    }

    if (dd->pme_receive_vir_ener)
    {
        /* Peer PP node: communicate all data */
        if (dd->cnb == nullptr)
        {
            snew(dd->cnb, 1);
        }
        cnb = dd->cnb;

        cnb->flags      = flags;
        cnb->natoms     = n;
        cnb->maxshift_x = maxshift_x;
        cnb->maxshift_y = maxshift_y;
        cnb->lambda_q   = lambda_q;
        cnb->lambda_lj  = lambda_lj;
        cnb->step       = step;
        if (flags & PP_PME_COORD)
        {
            copy_mat(box, cnb->box);
        }
#if GMX_MPI
        MPI_Isend(cnb,
                  sizeof(*cnb),
                  MPI_BYTE,
                  dd->pme_nodeid,
                  eCommType_CNB,
                  cr->mpi_comm_mysim,
                  &dd->req_pme[dd->nreq_pme++]);
#endif
    }
    else if (flags & (PP_PME_CHARGE | PP_PME_SQRTC6 | PP_PME_SIGMA))
    {
#if GMX_MPI
        /* Communicate only the number of atoms */
        MPI_Isend(&n,
                  sizeof(n),
                  MPI_BYTE,
                  dd->pme_nodeid,
                  eCommType_CNB,
                  cr->mpi_comm_mysim,
                  &dd->req_pme[dd->nreq_pme++]);
#endif
    }

#if GMX_MPI
    if (n > 0)
    {
        if (flags & PP_PME_CHARGE)
        {
            MPI_Isend(chargeA.data(),
                      n * sizeof(real),
                      MPI_BYTE,
                      dd->pme_nodeid,
                      eCommType_ChargeA,
                      cr->mpi_comm_mysim,
                      &dd->req_pme[dd->nreq_pme++]);
        }
        if (flags & PP_PME_CHARGEB)
        {
            MPI_Isend(chargeB.data(),
                      n * sizeof(real),
                      MPI_BYTE,
                      dd->pme_nodeid,
                      eCommType_ChargeB,
                      cr->mpi_comm_mysim,
                      &dd->req_pme[dd->nreq_pme++]);
        }
        if (flags & PP_PME_SQRTC6)
        {
            MPI_Isend(c6A.data(),
                      n * sizeof(real),
                      MPI_BYTE,
                      dd->pme_nodeid,
                      eCommType_SQRTC6A,
                      cr->mpi_comm_mysim,
                      &dd->req_pme[dd->nreq_pme++]);
        }
        if (flags & PP_PME_SQRTC6B)
        {
            MPI_Isend(c6B.data(),
                      n * sizeof(real),
                      MPI_BYTE,
                      dd->pme_nodeid,
                      eCommType_SQRTC6B,
                      cr->mpi_comm_mysim,
                      &dd->req_pme[dd->nreq_pme++]);
        }
        if (flags & PP_PME_SIGMA)
        {
            MPI_Isend(sigmaA.data(),
                      n * sizeof(real),
                      MPI_BYTE,
                      dd->pme_nodeid,
                      eCommType_SigmaA,
                      cr->mpi_comm_mysim,
                      &dd->req_pme[dd->nreq_pme++]);
        }
        if (flags & PP_PME_SIGMAB)
        {
            MPI_Isend(sigmaB.data(),
                      n * sizeof(real),
                      MPI_BYTE,
                      dd->pme_nodeid,
                      eCommType_SigmaB,
                      cr->mpi_comm_mysim,
                      &dd->req_pme[dd->nreq_pme++]);
        }
        if (flags & PP_PME_COORD)
        {
            if (reinitGpuPmePpComms)
            {
                fr->pmePpCommGpu->reinit(n);
            }

            if (useGpuPmePpComms && (fr != nullptr))
            {
                if (sendCoordinatesFromGpu)
                {
                    fr->pmePpCommGpu->sendCoordinatesToPmeFromGpu(
                            fr->stateGpu->getCoordinates(), n, coordinatesReadyOnDeviceEvent);
                }
                else
                {
                    fr->pmePpCommGpu->sendCoordinatesToPmeFromCpu(
                            const_cast<gmx::RVec*>(x.data()), n, coordinatesReadyOnDeviceEvent);
                }
            }
            else
            {
                MPI_Isend(x.data(),
                          n * sizeof(rvec),
                          MPI_BYTE,
                          dd->pme_nodeid,
                          eCommType_COORD,
                          cr->mpi_comm_mysim,
                          &dd->req_pme[dd->nreq_pme++]);
            }
        }
    }
#else
    GMX_UNUSED_VALUE(fr);
    GMX_UNUSED_VALUE(chargeA);
    GMX_UNUSED_VALUE(chargeB);
    GMX_UNUSED_VALUE(c6A);
    GMX_UNUSED_VALUE(c6B);
    GMX_UNUSED_VALUE(sigmaA);
    GMX_UNUSED_VALUE(sigmaB);
    GMX_UNUSED_VALUE(x);
    GMX_UNUSED_VALUE(reinitGpuPmePpComms);
    GMX_UNUSED_VALUE(sendCoordinatesFromGpu);
    GMX_UNUSED_VALUE(coordinatesReadyOnDeviceEvent);
#endif
    if (!c_useDelayedWait)
    {
        /* Wait for the data to arrive before returning.
         * With delayed waiting this wait could be skipped here, since x and q
         * will not be modified before the next call to gmx_pme_send_x_q or
         * gmx_pme_receive_f.
         */
        gmx_pme_send_coeffs_coords_wait(dd);
    }
}
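
/* The messages posted above are received on the PME-only ranks (implemented
 * elsewhere in the ewald module) using the same eCommType_* tags; the flags
 * field of the gmx_pme_comm_n_box_t header tells the receiver which per-atom
 * arrays to expect.
 */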

void gmx_pme_send_parameters(const t_commrec*           cr,
                             const interaction_const_t& interactionConst,
                             bool                       bFreeEnergy_q,
                             bool                       bFreeEnergy_lj,
                             gmx::ArrayRef<const real>  chargeA,
                             gmx::ArrayRef<const real>  chargeB,
                             gmx::ArrayRef<const real>  sqrt_c6A,
                             gmx::ArrayRef<const real>  sqrt_c6B,
                             gmx::ArrayRef<const real>  sigmaA,
                             gmx::ArrayRef<const real>  sigmaB,
                             int                        maxshift_x,
                             int                        maxshift_y)
{
    unsigned int flags = 0;

    if (EEL_PME(interactionConst.eeltype))
    {
        flags |= PP_PME_CHARGE;
    }
    if (EVDW_PME(interactionConst.vdwtype))
    {
        flags |= (PP_PME_SQRTC6 | PP_PME_SIGMA);
    }
    if (bFreeEnergy_q || bFreeEnergy_lj)
    {
        /* Assumes that the B state flags are in the bits just above
         * the ones for the A state. */
        flags |= (flags << 1);
    }
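    /* As a concrete illustration of the shift above (assuming, as the comment
     * says, that each PP_PME_*B flag occupies the bit directly above its A-state
     * counterpart): if flags contains PP_PME_CHARGE, then (flags << 1) contains
     * PP_PME_CHARGEB, so or-ing them in requests both the A- and B-state charges.
     */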

    gmx_pme_send_coeffs_coords(nullptr,
                               cr,
                               flags,
                               chargeA,
                               chargeB,
                               sqrt_c6A,
                               sqrt_c6B,
                               sigmaA,
                               sigmaB,
                               nullptr,
                               gmx::ArrayRef<gmx::RVec>(),
                               0,
                               0,
                               maxshift_x,
                               maxshift_y,
                               -1,
                               false,
                               false,
                               false,
                               nullptr);
}
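
/* gmx_pme_send_parameters() is intended to be called from the PP side whenever
 * the particle parameters on this rank change, e.g. after domain-decomposition
 * repartitioning, so that the PME rank holds charges (and, for LJ-PME, sqrt(C6)
 * and sigma) that match the coordinates sent each step.
 */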

void gmx_pme_send_coordinates(t_forcerec*                    fr,
                              const t_commrec*               cr,
                              const matrix                   box,
                              gmx::ArrayRef<const gmx::RVec> x,
                              real                           lambda_q,
                              real                           lambda_lj,
                              bool                           computeEnergyAndVirial,
                              int64_t                        step,
                              bool                           useGpuPmePpComms,
                              bool                           receiveCoordinateAddressFromPme,
                              bool                           sendCoordinatesFromGpu,
                              GpuEventSynchronizer*          coordinatesReadyOnDeviceEvent,
                              gmx_wallcycle*                 wcycle)
{
    wallcycle_start(wcycle, WallCycleCounter::PpPmeSendX);

    unsigned int flags = PP_PME_COORD;
    if (computeEnergyAndVirial)
    {
        flags |= PP_PME_ENER_VIR;
    }
    gmx_pme_send_coeffs_coords(fr,
                               cr,
                               flags,
                               {},
                               {},
                               {},
                               {},
                               {},
                               {},
                               box,
                               x,
                               lambda_q,
                               lambda_lj,
                               0,
                               0,
                               step,
                               useGpuPmePpComms,
                               receiveCoordinateAddressFromPme,
                               sendCoordinatesFromGpu,
                               coordinatesReadyOnDeviceEvent);

    wallcycle_stop(wcycle, WallCycleCounter::PpPmeSendX);
}
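
/* A minimal sketch of how a PP rank would typically drive this interface during
 * one MD step, using the CPU MPI path (the surrounding variables and the step
 * structure are illustrative, not part of this file):
 *
 *   gmx_pme_send_coordinates(fr, cr, box, x, lambda_q, lambda_lj,
 *                            computeEnergyAndVirial, step,
 *                            false, false, false, nullptr, wcycle);
 *   // ... compute short-range forces on the PP rank while PME runs ...
 *   gmx_pme_receive_f(nullptr, cr, &forceWithVirial, &e_q, &e_lj,
 *                     &dvdl_q, &dvdl_lj, false, false, &pmeCycles);
 */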

void gmx_pme_send_finish(const t_commrec* cr)
{
    unsigned int flags = PP_PME_FINISH;

    gmx_pme_send_coeffs_coords(
            nullptr, cr, flags, {}, {}, {}, {}, {}, {}, nullptr, gmx::ArrayRef<gmx::RVec>(), 0, 0, 0, 0, -1, false, false, false, nullptr);
}

void gmx_pme_send_switchgrid(const t_commrec* cr, ivec grid_size, real ewaldcoeff_q, real ewaldcoeff_lj)
{
#if GMX_MPI
    gmx_pme_comm_n_box_t cnb;

    /* Only let one PP node signal each PME node */
    if (cr->dd->pme_receive_vir_ener)
    {
        cnb.flags = PP_PME_SWITCHGRID;
        copy_ivec(grid_size, cnb.grid_size);
        cnb.ewaldcoeff_q  = ewaldcoeff_q;
        cnb.ewaldcoeff_lj = ewaldcoeff_lj;

        /* This message is uncommon, so we send it blocking to keep the code simple */
        MPI_Send(&cnb, sizeof(cnb), MPI_BYTE, cr->dd->pme_nodeid, eCommType_CNB, cr->mpi_comm_mysim);
    }
#else
    GMX_UNUSED_VALUE(cr);
    GMX_UNUSED_VALUE(grid_size);
    GMX_UNUSED_VALUE(ewaldcoeff_q);
    GMX_UNUSED_VALUE(ewaldcoeff_lj);
#endif
}
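
/* gmx_pme_send_switchgrid() is used when the PME grid size or Ewald coefficients
 * change on the fly, e.g. during PME tuning/load balancing, so that the PME-only
 * rank can switch to a matching grid setup before processing further coordinates.
 */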

void gmx_pme_send_resetcounters(const t_commrec gmx_unused* cr, int64_t gmx_unused step)
{
#if GMX_MPI
    gmx_pme_comm_n_box_t cnb;

    /* Only let one PP node signal each PME node */
    if (cr->dd->pme_receive_vir_ener)
    {
        cnb.flags = PP_PME_RESETCOUNTERS;
        cnb.step  = step;

        /* This message is uncommon, so we send it blocking to keep the code simple */
        MPI_Send(&cnb, sizeof(cnb), MPI_BYTE, cr->dd->pme_nodeid, eCommType_CNB, cr->mpi_comm_mysim);
    }
#endif
}

/*! \brief Receive virial and energy from PME rank */
static void receive_virial_energy(const t_commrec*      cr,
                                  gmx::ForceWithVirial* forceWithVirial,
                                  real*                 energy_q,
                                  real*                 energy_lj,
                                  real*                 dvdlambda_q,
                                  real*                 dvdlambda_lj,
                                  float*                pme_cycles)
{
    gmx_pme_comm_vir_ene_t cve;

    if (cr->dd->pme_receive_vir_ener)
    {
        if (debug)
        {
            fprintf(debug,
                    "PP rank %d receiving from PME rank %d: virial and energy\n",
                    cr->sim_nodeid,
                    cr->dd->pme_nodeid);
        }
#if GMX_MPI
        MPI_Recv(&cve, sizeof(cve), MPI_BYTE, cr->dd->pme_nodeid, 1, cr->mpi_comm_mysim, MPI_STATUS_IGNORE);
#else
        memset(&cve, 0, sizeof(cve));
#endif

        forceWithVirial->addVirialContribution(cve.vir_q);
        forceWithVirial->addVirialContribution(cve.vir_lj);
        *energy_q  = cve.energy_q;
        *energy_lj = cve.energy_lj;
        *dvdlambda_q += cve.dvdlambda_q;
        *dvdlambda_lj += cve.dvdlambda_lj;
        *pme_cycles = cve.cycles;

        if (cve.stop_cond != StopCondition::None)
        {
            gmx_set_stop_condition(cve.stop_cond);
        }
    }
    else
    {
        *energy_q   = 0;
        *energy_lj  = 0;
        *pme_cycles = 0;
    }
}
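
/* Note the message tags used on this path: the virial/energy struct arrives with
 * tag 1 (MPI_Recv above), while the force array arrives with tag 0 (see
 * recvFFromPme below), so the two receives cannot be confused even though both
 * come from the same PME rank.
 */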

/*! \brief Receive force data from PME ranks */
static void recvFFromPme(gmx::PmePpCommGpu* pmePpCommGpu,
                         void*              recvptr,
                         int                n,
                         const t_commrec*   cr,
                         bool               useGpuPmePpComms,
                         bool               receivePmeForceToGpu)
{
    if (useGpuPmePpComms)
    {
        GMX_ASSERT(pmePpCommGpu != nullptr, "Need valid pmePpCommGpu");
        // Receive forces from PME rank
        pmePpCommGpu->receiveForceFromPme(static_cast<gmx::RVec*>(recvptr), n, receivePmeForceToGpu);
    }
    else
    {
        // Receive data using MPI
#if GMX_MPI
        MPI_Recv(recvptr, n * sizeof(rvec), MPI_BYTE, cr->dd->pme_nodeid, 0, cr->mpi_comm_mysim, MPI_STATUS_IGNORE);
#else
        GMX_UNUSED_VALUE(cr);
        GMX_UNUSED_VALUE(n);
#endif
    }
}


void gmx_pme_receive_f(gmx::PmePpCommGpu*    pmePpCommGpu,
                       const t_commrec*      cr,
                       gmx::ForceWithVirial* forceWithVirial,
                       real*                 energy_q,
                       real*                 energy_lj,
                       real*                 dvdlambda_q,
                       real*                 dvdlambda_lj,
                       bool                  useGpuPmePpComms,
                       bool                  receivePmeForceToGpu,
                       float*                pme_cycles)
{
    if (c_useDelayedWait)
    {
        /* Wait for the x request to finish */
        gmx_pme_send_coeffs_coords_wait(cr->dd);
    }

    const int               natoms = dd_numHomeAtoms(*cr->dd);
    std::vector<gmx::RVec>& buffer = cr->dd->pmeForceReceiveBuffer;
    buffer.resize(natoms);

    void* recvptr = reinterpret_cast<void*>(buffer.data());
    recvFFromPme(pmePpCommGpu, recvptr, natoms, cr, useGpuPmePpComms, receivePmeForceToGpu);

    int nt = gmx_omp_nthreads_get_simple_rvec_task(ModuleMultiThread::Default, natoms);

    gmx::ArrayRef<gmx::RVec> f = forceWithVirial->force_;

    if (!receivePmeForceToGpu)
    {
        /* Note that we would like to avoid this conditional by putting it
         * into the omp pragma instead, but then we still take the full
         * omp parallel for overhead (at least with gcc5).
         */
        if (nt == 1)
        {
            for (int i = 0; i < natoms; i++)
            {
                f[i] += buffer[i];
            }
        }
        else
        {
#pragma omp parallel for num_threads(nt) schedule(static)
            for (int i = 0; i < natoms; i++)
            {
                f[i] += buffer[i];
            }
        }
    }

    receive_virial_energy(cr, forceWithVirial, energy_q, energy_lj, dvdlambda_q, dvdlambda_lj, pme_cycles);
}