Bug fix for event passed to GPU comm features
[alexxy/gromacs.git] / src/gromacs/ewald/pme_pp.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8  * and including many others, as listed in the AUTHORS file in the
9  * top-level source directory and at http://www.gromacs.org.
10  *
11  * GROMACS is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public License
13  * as published by the Free Software Foundation; either version 2.1
14  * of the License, or (at your option) any later version.
15  *
16  * GROMACS is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with GROMACS; if not, see
23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
25  *
26  * If you want to redistribute modifications to GROMACS, please
27  * consider that scientific software is very special. Version
28  * control is crucial - bugs must be traceable. We will be happy to
29  * consider code for inclusion in the official distribution, but
30  * derived work must not be called official GROMACS. Details are found
31  * in the README & COPYING files - if they are missing, get the
32  * official version at http://www.gromacs.org.
33  *
34  * To help us fund GROMACS development, we humbly ask that you cite
35  * the research papers on the package. Check out http://www.gromacs.org.
36  */
37 /*! \internal \file
38  *
39  * \brief This file contains function definitions necessary for
40  * managing the offload of long-ranged PME work to separate MPI ranks,
41  * for computing energies and forces (Coulomb and LJ).
42  *
43  * \author Berk Hess <hess@kth.se>
44  * \ingroup module_ewald
45  */
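/* A rough sketch of how a PP rank uses the functions in this file (inferred from
 * the definitions below; not a normative description of the interface):
 *
 *     gmx_pme_send_parameters(cr, ic, ...);            // after (re)partitioning
 *     // every MD step:
 *     gmx_pme_send_coordinates(fr, cr, box, x, ...);
 *     // ... local PP force computation ...
 *     gmx_pme_receive_f(pmePpCommGpu, cr, &forceWithVirial, ...);
 *     // at the end of the run:
 *     gmx_pme_send_finish(cr);
 */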
46
47 #include "gmxpre.h"
48
49 #include "config.h"
50
51 #include <cstdio>
52 #include <cstring>
53
54 #include "gromacs/domdec/domdec.h"
55 #include "gromacs/domdec/domdec_struct.h"
56 #include "gromacs/ewald/pme.h"
57 #include "gromacs/ewald/pme_pp_comm_gpu.h"
58 #include "gromacs/gmxlib/network.h"
59 #include "gromacs/math/vec.h"
60 #include "gromacs/mdlib/gmx_omp_nthreads.h"
61 #include "gromacs/mdtypes/commrec.h"
62 #include "gromacs/mdtypes/forceoutput.h"
63 #include "gromacs/mdtypes/forcerec.h"
64 #include "gromacs/mdtypes/interaction_const.h"
65 #include "gromacs/mdtypes/md_enums.h"
66 #include "gromacs/mdtypes/state_propagator_data_gpu.h"
67 #include "gromacs/nbnxm/nbnxm.h"
68 #include "gromacs/timing/wallcycle.h"
69 #include "gromacs/utility/fatalerror.h"
70 #include "gromacs/utility/gmxmpi.h"
71 #include "gromacs/utility/smalloc.h"
72
73 #include "pme_internal.h"
74 #include "pme_pp_communication.h"
75
76 /*! \brief Whether to delay the wait for the sends to the PME ranks to complete
77  *
78  * Delaying the wait should be faster with a truly non-blocking MPI implementation.
79  */
80 static constexpr bool c_useDelayedWait = false;
81
82 /*! \brief Wait for the pending data send requests to PME ranks to complete */
83 static void gmx_pme_send_coeffs_coords_wait(gmx_domdec_t *dd)
84 {
85     if (dd->nreq_pme)
86     {
87 #if GMX_MPI
88         MPI_Waitall(dd->nreq_pme, dd->req_pme, MPI_STATUSES_IGNORE);
89 #endif
90         dd->nreq_pme = 0;
91     }
92 }
93
94 /*! \brief Send data to PME ranks */
95 static void gmx_pme_send_coeffs_coords(t_forcerec *fr, const t_commrec *cr, unsigned int flags,
96                                        real gmx_unused *chargeA, real gmx_unused *chargeB,
97                                        real gmx_unused *c6A, real gmx_unused *c6B,
98                                        real gmx_unused *sigmaA, real gmx_unused *sigmaB,
99                                        const matrix box, const rvec gmx_unused *x,
100                                        real lambda_q, real lambda_lj,
101                                        int maxshift_x, int maxshift_y,
102                                        int64_t step, bool useGpuPmePpComms,
103                                        bool reinitGpuPmePpComms,
104                                        bool sendCoordinatesFromGpu,
105                                        GpuEventSynchronizer *coordinatesReadyOnDeviceEvent)
106 {
107     gmx_domdec_t         *dd;
108     gmx_pme_comm_n_box_t *cnb;
109     int                   n;
110
111     dd = cr->dd;
112     n  = dd_numHomeAtoms(*dd);
113
114     if (debug)
115     {
116         fprintf(debug, "PP rank %d sending to PME rank %d: %d%s%s%s%s\n",
117                 cr->sim_nodeid, dd->pme_nodeid, n,
118                 (flags & PP_PME_CHARGE) ? " charges" : "",
119                 (flags & PP_PME_SQRTC6) ? " sqrtC6" : "",
120                 (flags & PP_PME_SIGMA)  ? " sigma" : "",
121                 (flags & PP_PME_COORD)  ? " coordinates" : "");
122     }
123
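    /* When direct GPU PME-PP communication is in use, set PP_PME_GPUCOMMS so that
     * the receiving PME rank can handle the transfers accordingly. */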
124     if (useGpuPmePpComms)
125     {
126         flags |= PP_PME_GPUCOMMS;
127     }
128
129     if (c_useDelayedWait)
130     {
131         /* We cannot use cnb until the pending communication has finished */
132         gmx_pme_send_coeffs_coords_wait(dd);
133     }
134
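    /* Only the PP rank that will also receive the virial and energy from its PME rank
     * sends the full gmx_pme_comm_n_box_t header; other PP ranks send at most the
     * number of atoms. */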
135     if (dd->pme_receive_vir_ener)
136     {
137         /* Peer PP node: communicate all data */
138         if (dd->cnb == nullptr)
139         {
140             snew(dd->cnb, 1);
141         }
142         cnb = dd->cnb;
143
144         cnb->flags      = flags;
145         cnb->natoms     = n;
146         cnb->maxshift_x = maxshift_x;
147         cnb->maxshift_y = maxshift_y;
148         cnb->lambda_q   = lambda_q;
149         cnb->lambda_lj  = lambda_lj;
150         cnb->step       = step;
151         if (flags & PP_PME_COORD)
152         {
153             copy_mat(box, cnb->box);
154         }
155 #if GMX_MPI
156         MPI_Isend(cnb, sizeof(*cnb), MPI_BYTE,
157                   dd->pme_nodeid, eCommType_CNB, cr->mpi_comm_mysim,
158                   &dd->req_pme[dd->nreq_pme++]);
159 #endif
160     }
161     else if (flags & (PP_PME_CHARGE | PP_PME_SQRTC6 | PP_PME_SIGMA))
162     {
163 #if GMX_MPI
164         /* Communicate only the number of atoms */
165         MPI_Isend(&n, sizeof(n), MPI_BYTE,
166                   dd->pme_nodeid, eCommType_CNB, cr->mpi_comm_mysim,
167                   &dd->req_pme[dd->nreq_pme++]);
168 #endif
169     }
170
171 #if GMX_MPI
172     if (n > 0)
173     {
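        /* Post one non-blocking send per requested array; each send is matched on
         * the PME rank by its eCommType tag. */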
174         if (flags & PP_PME_CHARGE)
175         {
176             MPI_Isend(chargeA, n*sizeof(real), MPI_BYTE,
177                       dd->pme_nodeid, eCommType_ChargeA, cr->mpi_comm_mysim,
178                       &dd->req_pme[dd->nreq_pme++]);
179         }
180         if (flags & PP_PME_CHARGEB)
181         {
182             MPI_Isend(chargeB, n*sizeof(real), MPI_BYTE,
183                       dd->pme_nodeid, eCommType_ChargeB, cr->mpi_comm_mysim,
184                       &dd->req_pme[dd->nreq_pme++]);
185         }
186         if (flags & PP_PME_SQRTC6)
187         {
188             MPI_Isend(c6A, n*sizeof(real), MPI_BYTE,
189                       dd->pme_nodeid, eCommType_SQRTC6A, cr->mpi_comm_mysim,
190                       &dd->req_pme[dd->nreq_pme++]);
191         }
192         if (flags & PP_PME_SQRTC6B)
193         {
194             MPI_Isend(c6B, n*sizeof(real), MPI_BYTE,
195                       dd->pme_nodeid, eCommType_SQRTC6B, cr->mpi_comm_mysim,
196                       &dd->req_pme[dd->nreq_pme++]);
197         }
198         if (flags & PP_PME_SIGMA)
199         {
200             MPI_Isend(sigmaA, n*sizeof(real), MPI_BYTE,
201                       dd->pme_nodeid, eCommType_SigmaA, cr->mpi_comm_mysim,
202                       &dd->req_pme[dd->nreq_pme++]);
203         }
204         if (flags & PP_PME_SIGMAB)
205         {
206             MPI_Isend(sigmaB, n*sizeof(real), MPI_BYTE,
207                       dd->pme_nodeid, eCommType_SigmaB, cr->mpi_comm_mysim,
208                       &dd->req_pme[dd->nreq_pme++]);
209         }
210         if (flags & PP_PME_COORD)
211         {
212             if (reinitGpuPmePpComms)
213             {
214                 fr->pmePpCommGpu->reinit(n);
215             }
216
217
218             /* MPI_Isend does not accept a const buffer pointer */
219             real *xRealPtr = const_cast<real *>(x[0]);
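            /* With direct GPU PME-PP communication the coordinates are sent either from
             * GPU memory or from the host buffer; coordinatesReadyOnDeviceEvent is passed
             * along so that the transfer can be synchronized with the coordinates becoming
             * ready on the device (see sendCoordinatesFromGpu). */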
220             if (useGpuPmePpComms && (fr != nullptr))
221             {
222                 void *sendPtr = sendCoordinatesFromGpu ? static_cast<void*> (fr->stateGpu->getCoordinates()) :
223                     static_cast<void*> (xRealPtr);
224                 fr->pmePpCommGpu->sendCoordinatesToPmeCudaDirect(sendPtr, n, sendCoordinatesFromGpu, coordinatesReadyOnDeviceEvent);
225             }
226             else
227             {
228                 MPI_Isend(xRealPtr, n*sizeof(rvec), MPI_BYTE,
229                           dd->pme_nodeid, eCommType_COORD, cr->mpi_comm_mysim,
230                           &dd->req_pme[dd->nreq_pme++]);
231             }
232         }
233     }
234
235 #endif
236     if (!c_useDelayedWait)
237     {
238         /* Wait for the sends to complete */
239         /* This wait could be delayed (see c_useDelayedWait), since x and q will not
240          * be modified before the next call to gmx_pme_send_x_q or gmx_pme_receive_f.
241          */
242         gmx_pme_send_coeffs_coords_wait(dd);
243     }
244 }
245
246 void gmx_pme_send_parameters(const t_commrec *cr,
247                              const interaction_const_t *ic,
248                              gmx_bool bFreeEnergy_q, gmx_bool bFreeEnergy_lj,
249                              real *chargeA, real *chargeB,
250                              real *sqrt_c6A, real *sqrt_c6B,
251                              real *sigmaA, real *sigmaB,
252                              int maxshift_x, int maxshift_y)
253 {
254     unsigned int flags = 0;
255
256     if (EEL_PME(ic->eeltype))
257     {
258         flags |= PP_PME_CHARGE;
259     }
260     if (EVDW_PME(ic->vdwtype))
261     {
262         flags |= (PP_PME_SQRTC6 | PP_PME_SIGMA);
263     }
264     if (bFreeEnergy_q || bFreeEnergy_lj)
265     {
266         /* Assumes that the B state flags are in the bits just above
267          * the ones for the A state. */
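        /* For example, PP_PME_CHARGE << 1 is assumed to equal PP_PME_CHARGEB and
         * PP_PME_SQRTC6 << 1 to equal PP_PME_SQRTC6B. */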
268         flags |= (flags << 1);
269     }
270
271     gmx_pme_send_coeffs_coords(nullptr, cr, flags,
272                                chargeA, chargeB,
273                                sqrt_c6A, sqrt_c6B, sigmaA, sigmaB,
274                                nullptr, nullptr, 0, 0, maxshift_x, maxshift_y, -1, false, false, false, nullptr);
275 }
276
277 void gmx_pme_send_coordinates(t_forcerec *fr, const t_commrec *cr, const matrix box, const rvec *x,
278                               real lambda_q, real lambda_lj,
279                               gmx_bool bEnerVir,
280                               int64_t step, bool useGpuPmePpComms,
281                               bool receiveCoordinateAddressFromPme,
282                               bool sendCoordinatesFromGpu,
283                               GpuEventSynchronizer *coordinatesReadyOnDeviceEvent, gmx_wallcycle *wcycle)
284 {
285     wallcycle_start(wcycle, ewcPP_PMESENDX);
286
287     unsigned int flags = PP_PME_COORD;
288     if (bEnerVir)
289     {
290         flags |= PP_PME_ENER_VIR;
291     }
292     gmx_pme_send_coeffs_coords(fr, cr, flags, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
293                                box, x, lambda_q, lambda_lj, 0, 0, step, useGpuPmePpComms, receiveCoordinateAddressFromPme,
294                                sendCoordinatesFromGpu, coordinatesReadyOnDeviceEvent);
295
296     wallcycle_stop(wcycle, ewcPP_PMESENDX);
297 }
298
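/* Tell our PME rank that the run has finished; only the PP_PME_FINISH flag is
 * meaningful, so all other arguments are passed as null or zero. */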
299 void gmx_pme_send_finish(const t_commrec *cr)
300 {
301     unsigned int flags = PP_PME_FINISH;
302
303     gmx_pme_send_coeffs_coords(nullptr, cr, flags, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, 0, -1, false, false, false, nullptr);
304 }
305
306 void gmx_pme_send_switchgrid(const t_commrec *cr,
307                              ivec             grid_size,
308                              real             ewaldcoeff_q,
309                              real             ewaldcoeff_lj)
310 {
311 #if GMX_MPI
312     gmx_pme_comm_n_box_t cnb;
313
314     /* Only let one PP node signal each PME node */
315     if (cr->dd->pme_receive_vir_ener)
316     {
317         cnb.flags = PP_PME_SWITCHGRID;
318         copy_ivec(grid_size, cnb.grid_size);
319         cnb.ewaldcoeff_q  = ewaldcoeff_q;
320         cnb.ewaldcoeff_lj = ewaldcoeff_lj;
321
322         /* This message is uncommon, so we send it blocking to keep the code simple */
323         MPI_Send(&cnb, sizeof(cnb), MPI_BYTE,
324                  cr->dd->pme_nodeid, eCommType_CNB, cr->mpi_comm_mysim);
325     }
326 #else
327     GMX_UNUSED_VALUE(cr);
328     GMX_UNUSED_VALUE(grid_size);
329     GMX_UNUSED_VALUE(ewaldcoeff_q);
330     GMX_UNUSED_VALUE(ewaldcoeff_lj);
331 #endif
332 }
333
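/* Ask our PME rank to reset its cycle counters at the given step. Like the
 * grid-switch message above, this rare message is sent blocking. */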
334 void gmx_pme_send_resetcounters(const t_commrec gmx_unused *cr, int64_t gmx_unused step)
335 {
336 #if GMX_MPI
337     gmx_pme_comm_n_box_t cnb;
338
339     /* Only let one PP node signal each PME node */
340     if (cr->dd->pme_receive_vir_ener)
341     {
342         cnb.flags = PP_PME_RESETCOUNTERS;
343         cnb.step  = step;
344
345         /* This message is uncommon, so we send it blocking to keep the code simple */
346         MPI_Send(&cnb, sizeof(cnb), MPI_BYTE,
347                  cr->dd->pme_nodeid, eCommType_CNB, cr->mpi_comm_mysim);
348     }
349 #endif
350 }
351
352 /*! \brief Receive virial and energy from PME rank */
353 static void receive_virial_energy(const t_commrec *cr,
354                                   gmx::ForceWithVirial *forceWithVirial,
355                                   real *energy_q, real *energy_lj,
356                                   real *dvdlambda_q, real *dvdlambda_lj,
357                                   float *pme_cycles)
358 {
359     gmx_pme_comm_vir_ene_t cve;
360
361     if (cr->dd->pme_receive_vir_ener)
362     {
363         if (debug)
364         {
365             fprintf(debug,
366                     "PP rank %d receiving from PME rank %d: virial and energy\n",
367                     cr->sim_nodeid, cr->dd->pme_nodeid);
368         }
369 #if GMX_MPI
370         MPI_Recv(&cve, sizeof(cve), MPI_BYTE, cr->dd->pme_nodeid, 1, cr->mpi_comm_mysim,
371                  MPI_STATUS_IGNORE);
372 #else
373         memset(&cve, 0, sizeof(cve));
374 #endif
375
376         forceWithVirial->addVirialContribution(cve.vir_q);
377         forceWithVirial->addVirialContribution(cve.vir_lj);
378         *energy_q      = cve.energy_q;
379         *energy_lj     = cve.energy_lj;
380         *dvdlambda_q  += cve.dvdlambda_q;
381         *dvdlambda_lj += cve.dvdlambda_lj;
382         *pme_cycles    = cve.cycles;
383
384         if (cve.stop_cond != gmx_stop_cond_none)
385         {
386             gmx_set_stop_condition(cve.stop_cond);
387         }
388     }
389     else
390     {
391         *energy_q   = 0;
392         *energy_lj  = 0;
393         *pme_cycles = 0;
394     }
395 }
396
397 /*! \brief Receive force data from PME ranks */
398 static void recvFFromPme(gmx::PmePpCommGpu *pmePpCommGpu,
399                          void             * recvptr,
400                          int                n,
401                          const t_commrec   *cr,
402                          bool               useGpuPmePpComms,
403                          bool               receivePmeForceToGpu)
404 {
405     if (useGpuPmePpComms)
406     {
407         GMX_ASSERT(pmePpCommGpu != nullptr, "Need valid pmePpCommGpu");
408         // Receive directly using a CUDA memory copy
409         pmePpCommGpu->receiveForceFromPmeCudaDirect(recvptr, n, receivePmeForceToGpu);
410     }
411     else
412     {
413         // Receive the data using MPI
414 #if GMX_MPI
415         MPI_Recv(recvptr, n*sizeof(rvec), MPI_BYTE,
416                  cr->dd->pme_nodeid, 0, cr->mpi_comm_mysim,
417                  MPI_STATUS_IGNORE);
418 #endif
419     }
420 }
421
422
423 void gmx_pme_receive_f(gmx::PmePpCommGpu *pmePpCommGpu,
424                        const t_commrec *cr,
425                        gmx::ForceWithVirial *forceWithVirial,
426                        real *energy_q, real *energy_lj,
427                        real *dvdlambda_q, real *dvdlambda_lj,
428                        bool useGpuPmePpComms, bool receivePmeForceToGpu,
429                        float *pme_cycles)
430 {
431     if (c_useDelayedWait)
432     {
433         /* Wait for the x request to finish */
434         gmx_pme_send_coeffs_coords_wait(cr->dd);
435     }
436
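    /* Receive the PME forces into a buffer sized for this rank's home atoms. When the
     * forces are received directly into GPU memory (receivePmeForceToGpu), the
     * host-side reduction below is skipped. */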
437     const int               natoms = dd_numHomeAtoms(*cr->dd);
438     std::vector<gmx::RVec> &buffer = cr->dd->pmeForceReceiveBuffer;
439     buffer.resize(natoms);
440
441     void *recvptr = reinterpret_cast<void*>(buffer.data());
442     recvFFromPme(pmePpCommGpu, recvptr, natoms, cr, useGpuPmePpComms, receivePmeForceToGpu);
443
444     int nt = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, natoms);
445
446     gmx::ArrayRef<gmx::RVec> f = forceWithVirial->force_;
447
448     if (!receivePmeForceToGpu)
449     {
450         /* Note that we would like to avoid this conditional by putting it
451          * into the omp pragma instead, but then we still take the full
452          * omp parallel for overhead (at least with gcc5).
453          */
454         if (nt == 1)
455         {
456             for (int i = 0; i < natoms; i++)
457             {
458                 f[i] += buffer[i];
459             }
460         }
461         else
462         {
463 #pragma omp parallel for num_threads(nt) schedule(static)
464             for (int i = 0; i < natoms; i++)
465             {
466                 f[i] += buffer[i];
467             }
468         }
469     }
470
471     receive_virial_energy(cr, forceWithVirial, energy_q, energy_lj, dvdlambda_q, dvdlambda_lj, pme_cycles);
472 }