src/gromacs/ewald/pme_pp.cpp
/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
 * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *
 * \brief This file contains function definitions necessary for
 * managing the offload of long-ranged PME work to separate MPI ranks,
 * for computing energies and forces (Coulomb and LJ).
 *
 * \author Berk Hess <hess@kth.se>
 * \ingroup module_ewald
 */
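
/* Illustrative sketch of how a particle-particle (PP) rank typically drives this
 * module during an MD step (variable names are placeholders; the actual call sites
 * live in the mdrun force schedule):
 *
 *   gmx_pme_send_coordinates(fr, cr, box, x, lambda_q, lambda_lj, computeEnergyAndVirial,
 *                            step, useGpuPmePpComms, receiveCoordinateAddressFromPme,
 *                            sendCoordinatesFromGpu, receiveForcesToGpu,
 *                            coordinatesReadyOnDeviceEvent, wcycle);
 *   // ... compute short-ranged forces locally while the PME rank works ...
 *   gmx_pme_receive_f(pmePpCommGpu, cr, &forceWithVirial, &energy_q, &energy_lj,
 *                     &dvdlambda_q, &dvdlambda_lj, useGpuPmePpComms, receivePmeForceToGpu,
 *                     &pme_cycles);
 */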

#include "gmxpre.h"

#include "pme_pp.h"

#include "config.h"

#include <cstdio>
#include <cstring>

#include "gromacs/domdec/domdec.h"
#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/ewald/pme.h"
#include "gromacs/ewald/pme_pp_comm_gpu.h"
#include "gromacs/gmxlib/network.h"
#include "gromacs/math/vec.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/mdtypes/forceoutput.h"
#include "gromacs/mdtypes/forcerec.h"
#include "gromacs/mdtypes/interaction_const.h"
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/mdtypes/state_propagator_data_gpu.h"
#include "gromacs/nbnxm/nbnxm.h"
#include "gromacs/timing/wallcycle.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxmpi.h"
#include "gromacs/utility/smalloc.h"

#include "pme_pp_communication.h"

/*! \brief Whether to delay the blocking wait for communication to PME ranks to complete
 *
 * A delayed wait should be faster with a truly non-blocking MPI implementation
 */
static constexpr bool c_useDelayedWait = false;
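/* When c_useDelayedWait is true, the wait for outstanding sends is postponed until the
 * send buffers are about to be reused (at the start of the next send and in
 * gmx_pme_receive_f); when false, gmx_pme_send_coeffs_coords waits before returning. */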

/*! \brief Wait for the pending data send requests to PME ranks to complete */
static void gmx_pme_send_coeffs_coords_wait(gmx_domdec_t* dd)
{
    if (dd->nreq_pme)
    {
#if GMX_MPI
        MPI_Waitall(dd->nreq_pme, dd->req_pme, MPI_STATUSES_IGNORE);
#endif
        dd->nreq_pme = 0;
    }
}

/*! \brief Send data to PME ranks */
static void gmx_pme_send_coeffs_coords(t_forcerec*                    fr,
                                       const t_commrec*               cr,
                                       unsigned int                   flags,
                                       gmx::ArrayRef<const real>      chargeA,
                                       gmx::ArrayRef<const real>      chargeB,
                                       gmx::ArrayRef<const real>      c6A,
                                       gmx::ArrayRef<const real>      c6B,
                                       gmx::ArrayRef<const real>      sigmaA,
                                       gmx::ArrayRef<const real>      sigmaB,
                                       const matrix                   box,
                                       gmx::ArrayRef<const gmx::RVec> x,
                                       real                           lambda_q,
                                       real                           lambda_lj,
                                       int                            maxshift_x,
                                       int                            maxshift_y,
                                       int64_t                        step,
                                       bool                           useGpuPmePpComms,
                                       bool                           reinitGpuPmePpComms,
                                       bool                           sendCoordinatesFromGpu,
                                       bool                           receiveForcesToGpu,
                                       GpuEventSynchronizer*          coordinatesReadyOnDeviceEvent)
{
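    /* Message layout: the peer PP rank (the one with pme_receive_vir_ener set) sends a
     * full gmx_pme_comm_n_box_t header with flags, atom count, box, lambdas and step,
     * while the other PP ranks sharing the PME rank send only their atom count. The
     * requested parameter arrays and coordinates then follow as separate non-blocking
     * messages, each tagged with its eCommType value. */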
    gmx_domdec_t*         dd;
    gmx_pme_comm_n_box_t* cnb;
    int                   n;

    dd = cr->dd;
    n  = dd_numHomeAtoms(*dd);

    if (debug)
    {
        fprintf(debug,
                "PP rank %d sending to PME rank %d: %d%s%s%s%s\n",
                cr->sim_nodeid,
                dd->pme_nodeid,
                n,
                (flags & PP_PME_CHARGE) ? " charges" : "",
                (flags & PP_PME_SQRTC6) ? " sqrtC6" : "",
                (flags & PP_PME_SIGMA) ? " sigma" : "",
                (flags & PP_PME_COORD) ? " coordinates" : "");
    }

    if (useGpuPmePpComms)
    {
        flags |= PP_PME_GPUCOMMS;
        if (receiveForcesToGpu)
        {
            flags |= PP_PME_RECVFTOGPU;
        }
    }

    if (c_useDelayedWait)
    {
        /* We can not use cnb until pending communication has finished */
        gmx_pme_send_coeffs_coords_wait(dd);
    }

    if (dd->pme_receive_vir_ener)
    {
        /* Peer PP node: communicate all data */
        if (dd->cnb == nullptr)
        {
            snew(dd->cnb, 1);
        }
        cnb = dd->cnb;

        cnb->flags      = flags;
        cnb->natoms     = n;
        cnb->maxshift_x = maxshift_x;
        cnb->maxshift_y = maxshift_y;
        cnb->lambda_q   = lambda_q;
        cnb->lambda_lj  = lambda_lj;
        cnb->step       = step;
        if (flags & PP_PME_COORD)
        {
            copy_mat(box, cnb->box);
        }
#if GMX_MPI
        MPI_Isend(cnb,
                  sizeof(*cnb),
                  MPI_BYTE,
                  dd->pme_nodeid,
                  eCommType_CNB,
                  cr->mpi_comm_mysim,
                  &dd->req_pme[dd->nreq_pme++]);
#endif
    }
    else if (flags & (PP_PME_CHARGE | PP_PME_SQRTC6 | PP_PME_SIGMA))
    {
#if GMX_MPI
        /* Communicate only the number of atoms */
        MPI_Isend(&n,
                  sizeof(n),
                  MPI_BYTE,
                  dd->pme_nodeid,
                  eCommType_CNB,
                  cr->mpi_comm_mysim,
                  &dd->req_pme[dd->nreq_pme++]);
#endif
    }

#if GMX_MPI
    if (n > 0)
    {
        if (flags & PP_PME_CHARGE)
        {
            MPI_Isend(chargeA.data(),
                      n * sizeof(real),
                      MPI_BYTE,
                      dd->pme_nodeid,
                      eCommType_ChargeA,
                      cr->mpi_comm_mysim,
                      &dd->req_pme[dd->nreq_pme++]);
        }
        if (flags & PP_PME_CHARGEB)
        {
            MPI_Isend(chargeB.data(),
                      n * sizeof(real),
                      MPI_BYTE,
                      dd->pme_nodeid,
                      eCommType_ChargeB,
                      cr->mpi_comm_mysim,
                      &dd->req_pme[dd->nreq_pme++]);
        }
        if (flags & PP_PME_SQRTC6)
        {
            MPI_Isend(c6A.data(),
                      n * sizeof(real),
                      MPI_BYTE,
                      dd->pme_nodeid,
                      eCommType_SQRTC6A,
                      cr->mpi_comm_mysim,
                      &dd->req_pme[dd->nreq_pme++]);
        }
        if (flags & PP_PME_SQRTC6B)
        {
            MPI_Isend(c6B.data(),
                      n * sizeof(real),
                      MPI_BYTE,
                      dd->pme_nodeid,
                      eCommType_SQRTC6B,
                      cr->mpi_comm_mysim,
                      &dd->req_pme[dd->nreq_pme++]);
        }
        if (flags & PP_PME_SIGMA)
        {
            MPI_Isend(sigmaA.data(),
                      n * sizeof(real),
                      MPI_BYTE,
                      dd->pme_nodeid,
                      eCommType_SigmaA,
                      cr->mpi_comm_mysim,
                      &dd->req_pme[dd->nreq_pme++]);
        }
        if (flags & PP_PME_SIGMAB)
        {
            MPI_Isend(sigmaB.data(),
                      n * sizeof(real),
                      MPI_BYTE,
                      dd->pme_nodeid,
                      eCommType_SigmaB,
                      cr->mpi_comm_mysim,
                      &dd->req_pme[dd->nreq_pme++]);
        }
        if (flags & PP_PME_COORD)
        {
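            /* The coordinates take one of three routes: GPU-direct from GPU memory,
             * GPU-direct staged from the CPU coordinate buffer, or a plain MPI send
             * of the host coordinates. */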
            if (reinitGpuPmePpComms)
            {
                std::vector<gmx::RVec>& buffer = cr->dd->pmeForceReceiveBuffer;
                buffer.resize(n);
                fr->pmePpCommGpu->reinit(n);
            }

            if (useGpuPmePpComms && (fr != nullptr))
            {
                if (sendCoordinatesFromGpu)
                {
                    fr->pmePpCommGpu->sendCoordinatesToPmeFromGpu(
                            fr->stateGpu->getCoordinates(), n, coordinatesReadyOnDeviceEvent);
                }
                else
                {
                    fr->pmePpCommGpu->sendCoordinatesToPmeFromCpu(
                            const_cast<gmx::RVec*>(x.data()), n, coordinatesReadyOnDeviceEvent);
                }
            }
            else
            {
                MPI_Isend(x.data(),
                          n * sizeof(rvec),
                          MPI_BYTE,
                          dd->pme_nodeid,
                          eCommType_COORD,
                          cr->mpi_comm_mysim,
                          &dd->req_pme[dd->nreq_pme++]);
            }
        }
    }
#else
    GMX_UNUSED_VALUE(fr);
    GMX_UNUSED_VALUE(chargeA);
    GMX_UNUSED_VALUE(chargeB);
    GMX_UNUSED_VALUE(c6A);
    GMX_UNUSED_VALUE(c6B);
    GMX_UNUSED_VALUE(sigmaA);
    GMX_UNUSED_VALUE(sigmaB);
    GMX_UNUSED_VALUE(x);
    GMX_UNUSED_VALUE(reinitGpuPmePpComms);
    GMX_UNUSED_VALUE(sendCoordinatesFromGpu);
    GMX_UNUSED_VALUE(coordinatesReadyOnDeviceEvent);
#endif
    if (!c_useDelayedWait)
    {
        /* Wait for the data to arrive. In principle this wait could be skipped,
         * since x and q will not be modified before the next coordinate send
         * or the call to gmx_pme_receive_f.
         */
        gmx_pme_send_coeffs_coords_wait(dd);
    }
}

void gmx_pme_send_parameters(const t_commrec*           cr,
                             const interaction_const_t& interactionConst,
                             bool                       bFreeEnergy_q,
                             bool                       bFreeEnergy_lj,
                             gmx::ArrayRef<const real>  chargeA,
                             gmx::ArrayRef<const real>  chargeB,
                             gmx::ArrayRef<const real>  sqrt_c6A,
                             gmx::ArrayRef<const real>  sqrt_c6B,
                             gmx::ArrayRef<const real>  sigmaA,
                             gmx::ArrayRef<const real>  sigmaB,
                             int                        maxshift_x,
                             int                        maxshift_y)
{
    unsigned int flags = 0;

    if (EEL_PME(interactionConst.eeltype))
    {
        flags |= PP_PME_CHARGE;
    }
    if (EVDW_PME(interactionConst.vdwtype))
    {
        flags |= (PP_PME_SQRTC6 | PP_PME_SIGMA);
    }
    if (bFreeEnergy_q || bFreeEnergy_lj)
    {
        /* Assumes that the B state flags are in the bits just above
         * the ones for the A state. */
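        /* For example, a set PP_PME_CHARGE bit also sets PP_PME_CHARGEB after the shift. */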
        flags |= (flags << 1);
    }

    gmx_pme_send_coeffs_coords(nullptr,
                               cr,
                               flags,
                               chargeA,
                               chargeB,
                               sqrt_c6A,
                               sqrt_c6B,
                               sigmaA,
                               sigmaB,
                               nullptr,
                               gmx::ArrayRef<gmx::RVec>(),
                               0,
                               0,
                               maxshift_x,
                               maxshift_y,
                               -1,
                               false,
                               false,
                               false,
                               false,
                               nullptr);
}

void gmx_pme_send_coordinates(t_forcerec*                    fr,
                              const t_commrec*               cr,
                              const matrix                   box,
                              gmx::ArrayRef<const gmx::RVec> x,
                              real                           lambda_q,
                              real                           lambda_lj,
                              bool                           computeEnergyAndVirial,
                              int64_t                        step,
                              bool                           useGpuPmePpComms,
                              bool                           receiveCoordinateAddressFromPme,
                              bool                           sendCoordinatesFromGpu,
                              bool                           receiveForcesToGpu,
                              GpuEventSynchronizer*          coordinatesReadyOnDeviceEvent,
                              gmx_wallcycle*                 wcycle)
{
    wallcycle_start(wcycle, WallCycleCounter::PpPmeSendX);

    unsigned int flags = PP_PME_COORD;
    if (computeEnergyAndVirial)
    {
        flags |= PP_PME_ENER_VIR;
    }
    gmx_pme_send_coeffs_coords(fr,
                               cr,
                               flags,
                               {},
                               {},
                               {},
                               {},
                               {},
                               {},
                               box,
                               x,
                               lambda_q,
                               lambda_lj,
                               0,
                               0,
                               step,
                               useGpuPmePpComms,
                               receiveCoordinateAddressFromPme,
                               sendCoordinatesFromGpu,
                               receiveForcesToGpu,
                               coordinatesReadyOnDeviceEvent);

    wallcycle_stop(wcycle, WallCycleCounter::PpPmeSendX);
}

void gmx_pme_send_finish(const t_commrec* cr)
{
    unsigned int flags = PP_PME_FINISH;

    gmx_pme_send_coeffs_coords(
            nullptr, cr, flags, {}, {}, {}, {}, {}, {}, nullptr, gmx::ArrayRef<gmx::RVec>(), 0, 0, 0, 0, -1, false, false, false, false, nullptr);
}

void gmx_pme_send_switchgrid(const t_commrec* cr, ivec grid_size, real ewaldcoeff_q, real ewaldcoeff_lj)
{
#if GMX_MPI
    gmx_pme_comm_n_box_t cnb;

    /* Only let one PP node signal each PME node */
    if (cr->dd->pme_receive_vir_ener)
    {
        cnb.flags = PP_PME_SWITCHGRID;
        copy_ivec(grid_size, cnb.grid_size);
        cnb.ewaldcoeff_q  = ewaldcoeff_q;
        cnb.ewaldcoeff_lj = ewaldcoeff_lj;

        /* This message is uncommon, so we send it blocking to simplify the code */
        MPI_Send(&cnb, sizeof(cnb), MPI_BYTE, cr->dd->pme_nodeid, eCommType_CNB, cr->mpi_comm_mysim);
    }
#else
    GMX_UNUSED_VALUE(cr);
    GMX_UNUSED_VALUE(grid_size);
    GMX_UNUSED_VALUE(ewaldcoeff_q);
    GMX_UNUSED_VALUE(ewaldcoeff_lj);
#endif
}

void gmx_pme_send_resetcounters(const t_commrec gmx_unused* cr, int64_t gmx_unused step)
{
#if GMX_MPI
    gmx_pme_comm_n_box_t cnb;

    /* Only let one PP node signal each PME node */
    if (cr->dd->pme_receive_vir_ener)
    {
        cnb.flags = PP_PME_RESETCOUNTERS;
        cnb.step  = step;

        /* This message is uncommon, so we send it blocking to simplify the code */
        MPI_Send(&cnb, sizeof(cnb), MPI_BYTE, cr->dd->pme_nodeid, eCommType_CNB, cr->mpi_comm_mysim);
    }
#endif
}

/*! \brief Receive virial and energy from PME rank */
static void receive_virial_energy(const t_commrec*      cr,
                                  gmx::ForceWithVirial* forceWithVirial,
                                  real*                 energy_q,
                                  real*                 energy_lj,
                                  real*                 dvdlambda_q,
                                  real*                 dvdlambda_lj,
                                  float*                pme_cycles)
{
    gmx_pme_comm_vir_ene_t cve;

    if (cr->dd->pme_receive_vir_ener)
    {
        if (debug)
        {
            fprintf(debug,
                    "PP rank %d receiving from PME rank %d: virial and energy\n",
                    cr->sim_nodeid,
                    cr->dd->pme_nodeid);
        }
#if GMX_MPI
        MPI_Recv(&cve, sizeof(cve), MPI_BYTE, cr->dd->pme_nodeid, 1, cr->mpi_comm_mysim, MPI_STATUS_IGNORE);
#else
        memset(&cve, 0, sizeof(cve));
#endif

        forceWithVirial->addVirialContribution(cve.vir_q);
        forceWithVirial->addVirialContribution(cve.vir_lj);
        *energy_q  = cve.energy_q;
        *energy_lj = cve.energy_lj;
        *dvdlambda_q += cve.dvdlambda_q;
        *dvdlambda_lj += cve.dvdlambda_lj;
        *pme_cycles = cve.cycles;

        if (cve.stop_cond != StopCondition::None)
        {
            gmx_set_stop_condition(cve.stop_cond);
        }
    }
    else
    {
        *energy_q   = 0;
        *energy_lj  = 0;
        *pme_cycles = 0;
    }
}

/*! \brief Receive force data from PME ranks */
static void recvFFromPme(gmx::PmePpCommGpu* pmePpCommGpu,
                         void*              recvptr,
                         int                n,
                         const t_commrec*   cr,
                         bool               useGpuPmePpComms,
                         bool               receivePmeForceToGpu)
{
    if (useGpuPmePpComms)
    {
        GMX_ASSERT(pmePpCommGpu != nullptr, "Need valid pmePpCommGpu");
        // Receive forces from PME rank
        pmePpCommGpu->receiveForceFromPme(static_cast<gmx::RVec*>(recvptr), n, receivePmeForceToGpu);
    }
    else
    {
        // Receive data using MPI
#if GMX_MPI
        MPI_Recv(recvptr, n * sizeof(rvec), MPI_BYTE, cr->dd->pme_nodeid, 0, cr->mpi_comm_mysim, MPI_STATUS_IGNORE);
#else
        GMX_UNUSED_VALUE(cr);
        GMX_UNUSED_VALUE(n);
#endif
    }
}


void gmx_pme_receive_f(gmx::PmePpCommGpu*    pmePpCommGpu,
                       const t_commrec*      cr,
                       gmx::ForceWithVirial* forceWithVirial,
                       real*                 energy_q,
                       real*                 energy_lj,
                       real*                 dvdlambda_q,
                       real*                 dvdlambda_lj,
                       bool                  useGpuPmePpComms,
                       bool                  receivePmeForceToGpu,
                       float*                pme_cycles)
{
    if (c_useDelayedWait)
    {
        /* Wait for the x request to finish */
        gmx_pme_send_coeffs_coords_wait(cr->dd);
    }

    const int               natoms = dd_numHomeAtoms(*cr->dd);
    std::vector<gmx::RVec>& buffer = cr->dd->pmeForceReceiveBuffer;
    buffer.resize(natoms);

    void* recvptr = reinterpret_cast<void*>(buffer.data());
    recvFFromPme(pmePpCommGpu, recvptr, natoms, cr, useGpuPmePpComms, receivePmeForceToGpu);

    int nt = gmx_omp_nthreads_get_simple_rvec_task(ModuleMultiThread::Default, natoms);

    gmx::ArrayRef<gmx::RVec> f = forceWithVirial->force_;

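    /* When the forces were received directly into GPU memory, the accumulation into
     * the CPU force buffer below is skipped; the PME contribution is then added by
     * the GPU-side force reduction elsewhere in the force schedule. */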
    if (!receivePmeForceToGpu)
    {
        /* Note that we would like to avoid this conditional by putting it
         * into the omp pragma instead, but then we still take the full
         * omp parallel for overhead (at least with gcc5).
         */
        if (nt == 1)
        {
            for (int i = 0; i < natoms; i++)
            {
                f[i] += buffer[i];
            }
        }
        else
        {
#pragma omp parallel for num_threads(nt) schedule(static)
            for (int i = 0; i < natoms; i++)
            {
                f[i] += buffer[i];
            }
        }
    }

    receive_virial_energy(cr, forceWithVirial, energy_q, energy_lj, dvdlambda_q, dvdlambda_lj, pme_cycles);
}