Bug fix for event passed to GPU comm features
[alexxy/gromacs.git] / src/gromacs/ewald/pme_pp.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8  * and including many others, as listed in the AUTHORS file in the
9  * top-level source directory and at http://www.gromacs.org.
10  *
11  * GROMACS is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public License
13  * as published by the Free Software Foundation; either version 2.1
14  * of the License, or (at your option) any later version.
15  *
16  * GROMACS is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with GROMACS; if not, see
23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
25  *
26  * If you want to redistribute modifications to GROMACS, please
27  * consider that scientific software is very special. Version
28  * control is crucial - bugs must be traceable. We will be happy to
29  * consider code for inclusion in the official distribution, but
30  * derived work must not be called official GROMACS. Details are found
31  * in the README & COPYING files - if they are missing, get the
32  * official version at http://www.gromacs.org.
33  *
34  * To help us fund GROMACS development, we humbly ask that you cite
35  * the research papers on the package. Check out http://www.gromacs.org.
36  */
37 /*! \internal \file
38  *
39  * \brief This file contains function definitions necessary for
40  * managing the offload of long-ranged PME work to separate MPI ranks,
41  * for computing energies and forces (Coulomb and LJ).
42  *
43  * \author Berk Hess <hess@kth.se>
44  * \ingroup module_ewald
45  */
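/* A rough sketch of how a PP rank uses the functions in this file (inferred from
 * the definitions below; not a normative description of the interface):
 *
 *     gmx_pme_send_parameters(cr, ic, ...);            // after (re)partitioning
 *     // every MD step:
 *     gmx_pme_send_coordinates(fr, cr, box, x, ...);
 *     // ... local PP force computation ...
 *     gmx_pme_receive_f(pmePpCommGpu, cr, &forceWithVirial, ...);
 *     // at the end of the run:
 *     gmx_pme_send_finish(cr);
 */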
46
47 #include "gmxpre.h"
48
49 #include "config.h"
50
51 #include <cstdio>
52 #include <cstring>
53
54 #include "gromacs/domdec/domdec.h"
55 #include "gromacs/domdec/domdec_struct.h"
56 #include "gromacs/ewald/pme.h"
57 #include "gromacs/ewald/pme_pp_comm_gpu.h"
58 #include "gromacs/gmxlib/network.h"
59 #include "gromacs/math/vec.h"
60 #include "gromacs/mdlib/gmx_omp_nthreads.h"
61 #include "gromacs/mdtypes/commrec.h"
62 #include "gromacs/mdtypes/forceoutput.h"
63 #include "gromacs/mdtypes/forcerec.h"
64 #include "gromacs/mdtypes/interaction_const.h"
65 #include "gromacs/mdtypes/md_enums.h"
66 #include "gromacs/mdtypes/state_propagator_data_gpu.h"
67 #include "gromacs/nbnxm/nbnxm.h"
68 #include "gromacs/timing/wallcycle.h"
69 #include "gromacs/utility/fatalerror.h"
70 #include "gromacs/utility/gmxmpi.h"
71 #include "gromacs/utility/smalloc.h"
72
73 #include "pme_internal.h"
74 #include "pme_pp_communication.h"
75
76 /*! \brief Whether to delay the wait for the sends to the PME ranks to complete
77  *
78  * Delaying the wait should be faster with a truly non-blocking MPI implementation.
79  */
80 static constexpr bool c_useDelayedWait = false;
81
82 /*! \brief Wait for the pending data send requests to PME ranks to complete */
83 static void gmx_pme_send_coeffs_coords_wait(gmx_domdec_t *dd)
84 {
85     if (dd->nreq_pme)
86     {
87 #if GMX_MPI
88         MPI_Waitall(dd->nreq_pme, dd->req_pme, MPI_STATUSES_IGNORE);
89 #endif
90         dd->nreq_pme = 0;
91     }
92 }
93
94 /*! \brief Send data to PME ranks */
95 static void gmx_pme_send_coeffs_coords(t_forcerec *fr, const t_commrec *cr, unsigned int flags,
96                                        real gmx_unused *chargeA, real gmx_unused *chargeB,
97                                        real gmx_unused *c6A, real gmx_unused *c6B,
98                                        real gmx_unused *sigmaA, real gmx_unused *sigmaB,
99                                        const matrix box, const rvec gmx_unused *x,
100                                        real lambda_q, real lambda_lj,
101                                        int maxshift_x, int maxshift_y,
102                                        int64_t step, bool useGpuPmePpComms,
103                                        bool reinitGpuPmePpComms,
104                                        bool sendCoordinatesFromGpu,
105                                        GpuEventSynchronizer *coordinatesReadyOnDeviceEvent)
106 {
107     gmx_domdec_t         *dd;
108     gmx_pme_comm_n_box_t *cnb;
109     int                   n;
110
111     dd = cr->dd;
112     n  = dd_numHomeAtoms(*dd);
113
114     if (debug)
115     {
116         fprintf(debug, "PP rank %d sending to PME rank %d: %d%s%s%s%s\n",
117                 cr->sim_nodeid, dd->pme_nodeid, n,
118                 (flags & PP_PME_CHARGE) ? " charges" : "",
119                 (flags & PP_PME_SQRTC6) ? " sqrtC6" : "",
120                 (flags & PP_PME_SIGMA)  ? " sigma" : "",
121                 (flags & PP_PME_COORD)  ? " coordinates" : "");
122     }
123
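    /* When direct GPU PME-PP communication is in use, set PP_PME_GPUCOMMS so that
     * the receiving PME rank can handle the transfers accordingly. */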
124     if (useGpuPmePpComms)
125     {
126         flags |= PP_PME_GPUCOMMS;
127     }
128
129     if (c_useDelayedWait)
130     {
131         /* We cannot use cnb until the pending communication has finished */
132         gmx_pme_send_coeffs_coords_wait(dd);
133     }
134
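    /* Only the PP rank that will also receive the virial and energy from its PME rank
     * sends the full gmx_pme_comm_n_box_t header; other PP ranks send at most the
     * number of atoms. */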
135     if (dd->pme_receive_vir_ener)
136     {
137         /* Peer PP node: communicate all data */
138         if (dd->cnb == nullptr)
139         {
140             snew(dd->cnb, 1);
141         }
142         cnb = dd->cnb;
143
144         cnb->flags      = flags;
145         cnb->natoms     = n;
146         cnb->maxshift_x = maxshift_x;
147         cnb->maxshift_y = maxshift_y;
148         cnb->lambda_q   = lambda_q;
149         cnb->lambda_lj  = lambda_lj;
150         cnb->step       = step;
151         if (flags & PP_PME_COORD)
152         {
153             copy_mat(box, cnb->box);
154         }
155 #if GMX_MPI
156         MPI_Isend(cnb, sizeof(*cnb), MPI_BYTE,
157                   dd->pme_nodeid, eCommType_CNB, cr->mpi_comm_mysim,
158                   &dd->req_pme[dd->nreq_pme++]);
159 #endif
160     }
161     else if (flags & (PP_PME_CHARGE | PP_PME_SQRTC6 | PP_PME_SIGMA))
162     {
163 #if GMX_MPI
164         /* Communicate only the number of atoms */
165         MPI_Isend(&n, sizeof(n), MPI_BYTE,
166                   dd->pme_nodeid, eCommType_CNB, cr->mpi_comm_mysim,
167                   &dd->req_pme[dd->nreq_pme++]);
168 #endif
169     }
170
171 #if GMX_MPI
172     if (n > 0)
173     {
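        /* Post one non-blocking send per requested array; each send is matched on
         * the PME rank by its eCommType tag. */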
174         if (flags & PP_PME_CHARGE)
175         {
176             MPI_Isend(chargeA, n*sizeof(real), MPI_BYTE,
177                       dd->pme_nodeid, eCommType_ChargeA, cr->mpi_comm_mysim,
178                       &dd->req_pme[dd->nreq_pme++]);
179         }
180         if (flags & PP_PME_CHARGEB)
181         {
182             MPI_Isend(chargeB, n*sizeof(real), MPI_BYTE,
183                       dd->pme_nodeid, eCommType_ChargeB, cr->mpi_comm_mysim,
184                       &dd->req_pme[dd->nreq_pme++]);
185         }
186         if (flags & PP_PME_SQRTC6)
187         {
188             MPI_Isend(c6A, n*sizeof(real), MPI_BYTE,
189                       dd->pme_nodeid, eCommType_SQRTC6A, cr->mpi_comm_mysim,
190                       &dd->req_pme[dd->nreq_pme++]);
191         }
192         if (flags & PP_PME_SQRTC6B)
193         {
194             MPI_Isend(c6B, n*sizeof(real), MPI_BYTE,
195                       dd->pme_nodeid, eCommType_SQRTC6B, cr->mpi_comm_mysim,
196                       &dd->req_pme[dd->nreq_pme++]);
197         }
198         if (flags & PP_PME_SIGMA)
199         {
200             MPI_Isend(sigmaA, n*sizeof(real), MPI_BYTE,
201                       dd->pme_nodeid, eCommType_SigmaA, cr->mpi_comm_mysim,
202                       &dd->req_pme[dd->nreq_pme++]);
203         }
204         if (flags & PP_PME_SIGMAB)
205         {
206             MPI_Isend(sigmaB, n*sizeof(real), MPI_BYTE,
207                       dd->pme_nodeid, eCommType_SigmaB, cr->mpi_comm_mysim,
208                       &dd->req_pme[dd->nreq_pme++]);
209         }
210         if (flags & PP_PME_COORD)
211         {
212             if (reinitGpuPmePpComms)
213             {
214                 fr->pmePpCommGpu->reinit(n);
215             }
216
217
218             /* MPI_Isend does not accept a const buffer pointer */
219             real *xRealPtr = const_cast<real *>(x[0]);
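            /* With direct GPU PME-PP communication the coordinates are sent either from
             * GPU memory or from the host buffer; coordinatesReadyOnDeviceEvent is passed
             * along so that the transfer can be synchronized with the coordinates becoming
             * ready on the device (see sendCoordinatesFromGpu). */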
220             if (useGpuPmePpComms && (fr != nullptr))
221             {
222                 void *sendPtr = sendCoordinatesFromGpu ? static_cast<void*> (fr->stateGpu->getCoordinates()) :
223                     static_cast<void*> (xRealPtr);
224                 fr->pmePpCommGpu->sendCoordinatesToPmeCudaDirect(sendPtr, n, sendCoordinatesFromGpu, coordinatesReadyOnDeviceEvent);
225             }
226             else
227             {
228                 MPI_Isend(xRealPtr, n*sizeof(rvec), MPI_BYTE,
229                           dd->pme_nodeid, eCommType_COORD, cr->mpi_comm_mysim,
230                           &dd->req_pme[dd->nreq_pme++]);
231             }
232         }
233     }
234
235 #endif
236     if (!c_useDelayedWait)
237     {
238         /* Wait for the sends to complete */
239         /* This wait could be delayed (see c_useDelayedWait), since x and q will not
240          * be modified before the next call to gmx_pme_send_x_q or gmx_pme_receive_f.
241          */
242         gmx_pme_send_coeffs_coords_wait(dd);
243     }
244 }
245
246 void gmx_pme_send_parameters(const t_commrec *cr,
247                              const interaction_const_t *ic,
248                              gmx_bool bFreeEnergy_q, gmx_bool bFreeEnergy_lj,
249                              real *chargeA, real *chargeB,
250                              real *sqrt_c6A, real *sqrt_c6B,
251                              real *sigmaA, real *sigmaB,
252                              int maxshift_x, int maxshift_y)
253 {
254     unsigned int flags = 0;
255
256     if (EEL_PME(ic->eeltype))
257     {
258         flags |= PP_PME_CHARGE;
259     }
260     if (EVDW_PME(ic->vdwtype))
261     {
262         flags |= (PP_PME_SQRTC6 | PP_PME_SIGMA);
263     }
264     if (bFreeEnergy_q || bFreeEnergy_lj)
265     {
266         /* Assumes that the B state flags are in the bits just above
267          * the ones for the A state. */
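        /* For example, PP_PME_CHARGE << 1 is assumed to equal PP_PME_CHARGEB and
         * PP_PME_SQRTC6 << 1 to equal PP_PME_SQRTC6B. */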
268         flags |= (flags << 1);
269     }
270
271     gmx_pme_send_coeffs_coords(nullptr, cr, flags,
272                                chargeA, chargeB,
273                                sqrt_c6A, sqrt_c6B, sigmaA, sigmaB,
274                                nullptr, nullptr, 0, 0, maxshift_x, maxshift_y, -1, false, false, false, nullptr);
275 }
276
277 void gmx_pme_send_coordinates(t_forcerec *fr, const t_commrec *cr, const matrix box, const rvec *x,
278                               real lambda_q, real lambda_lj,
279                               gmx_bool bEnerVir,
280                               int64_t step, bool useGpuPmePpComms,
281                               bool receiveCoordinateAddressFromPme,
282                               bool sendCoordinatesFromGpu,
283                               GpuEventSynchronizer *coordinatesReadyOnDeviceEvent, gmx_wallcycle *wcycle)
284 {
285     wallcycle_start(wcycle, ewcPP_PMESENDX);
286
287     unsigned int flags = PP_PME_COORD;
288     if (bEnerVir)
289     {
290         flags |= PP_PME_ENER_VIR;
291     }
292     gmx_pme_send_coeffs_coords(fr, cr, flags, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
293                                box, x, lambda_q, lambda_lj, 0, 0, step, useGpuPmePpComms, receiveCoordinateAddressFromPme,
294                                sendCoordinatesFromGpu, coordinatesReadyOnDeviceEvent);
295
296     wallcycle_stop(wcycle, ewcPP_PMESENDX);
297 }
298
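/* Tell our PME rank that the run has finished; only the PP_PME_FINISH flag is
 * meaningful, so all other arguments are passed as null or zero. */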
299 void gmx_pme_send_finish(const t_commrec *cr)
300 {
301     unsigned int flags = PP_PME_FINISH;
302
303     gmx_pme_send_coeffs_coords(nullptr, cr, flags, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, 0, -1, false, false, false, nullptr);
304 }
305
306 void gmx_pme_send_switchgrid(const t_commrec *cr,
307                              ivec             grid_size,
308                              real             ewaldcoeff_q,
309                              real             ewaldcoeff_lj)
310 {
311 #if GMX_MPI
312     gmx_pme_comm_n_box_t cnb;
313
314     /* Only let one PP node signal each PME node */
315     if (cr->dd->pme_receive_vir_ener)
316     {
317         cnb.flags = PP_PME_SWITCHGRID;
318         copy_ivec(grid_size, cnb.grid_size);
319         cnb.ewaldcoeff_q  = ewaldcoeff_q;
320         cnb.ewaldcoeff_lj = ewaldcoeff_lj;
321
322         /* This message is uncommon, so we send it blocking to keep the code simple */
323         MPI_Send(&cnb, sizeof(cnb), MPI_BYTE,
324                  cr->dd->pme_nodeid, eCommType_CNB, cr->mpi_comm_mysim);
325     }
326 #else
327     GMX_UNUSED_VALUE(cr);
328     GMX_UNUSED_VALUE(grid_size);
329     GMX_UNUSED_VALUE(ewaldcoeff_q);
330     GMX_UNUSED_VALUE(ewaldcoeff_lj);
331 #endif
332 }
333
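/* Ask our PME rank to reset its cycle counters at the given step. Like the
 * grid-switch message above, this rare message is sent blocking. */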
334 void gmx_pme_send_resetcounters(const t_commrec gmx_unused *cr, int64_t gmx_unused step)
335 {
336 #if GMX_MPI
337     gmx_pme_comm_n_box_t cnb;
338
339     /* Only let one PP node signal each PME node */
340     if (cr->dd->pme_receive_vir_ener)
341     {
342         cnb.flags = PP_PME_RESETCOUNTERS;
343         cnb.step  = step;
344
345         /* This message is uncommon, so we send it blocking to keep the code simple */
346         MPI_Send(&cnb, sizeof(cnb), MPI_BYTE,
347                  cr->dd->pme_nodeid, eCommType_CNB, cr->mpi_comm_mysim);
348     }
349 #endif
350 }
351
352 /*! \brief Receive virial and energy from PME rank */
353 static void receive_virial_energy(const t_commrec *cr,
354                                   gmx::ForceWithVirial *forceWithVirial,
355                                   real *energy_q, real *energy_lj,
356                                   real *dvdlambda_q, real *dvdlambda_lj,
357                                   float *pme_cycles)
358 {
359     gmx_pme_comm_vir_ene_t cve;
360
361     if (cr->dd->pme_receive_vir_ener)
362     {
363         if (debug)
364         {
365             fprintf(debug,
366                     "PP rank %d receiving from PME rank %d: virial and energy\n",
367                     cr->sim_nodeid, cr->dd->pme_nodeid);
368         }
369 #if GMX_MPI
370         MPI_Recv(&cve, sizeof(cve), MPI_BYTE, cr->dd->pme_nodeid, 1, cr->mpi_comm_mysim,
371                  MPI_STATUS_IGNORE);
372 #else
373         memset(&cve, 0, sizeof(cve));
374 #endif
375
376         forceWithVirial->addVirialContribution(cve.vir_q);
377         forceWithVirial->addVirialContribution(cve.vir_lj);
378         *energy_q      = cve.energy_q;
379         *energy_lj     = cve.energy_lj;
380         *dvdlambda_q  += cve.dvdlambda_q;
381         *dvdlambda_lj += cve.dvdlambda_lj;
382         *pme_cycles    = cve.cycles;
383
384         if (cve.stop_cond != gmx_stop_cond_none)
385         {
386             gmx_set_stop_condition(cve.stop_cond);
387         }
388     }
389     else
390     {
391         *energy_q   = 0;
392         *energy_lj  = 0;
393         *pme_cycles = 0;
394     }
395 }
396
397 /*! \brief Receive force data from PME ranks */
398 static void recvFFromPme(gmx::PmePpCommGpu *pmePpCommGpu,
399                          void             * recvptr,
400                          int                n,
401                          const t_commrec   *cr,
402                          bool               useGpuPmePpComms,
403                          bool               receivePmeForceToGpu)
404 {
405     if (useGpuPmePpComms)
406     {
407         GMX_ASSERT(pmePpCommGpu != nullptr, "Need valid pmePpCommGpu");
408         // Receive directly using a CUDA memory copy
409         pmePpCommGpu->receiveForceFromPmeCudaDirect(recvptr, n, receivePmeForceToGpu);
410     }
411     else
412     {
413         // Receive the data using MPI
414 #if GMX_MPI
415         MPI_Recv(recvptr, n*sizeof(rvec), MPI_BYTE,
416                  cr->dd->pme_nodeid, 0, cr->mpi_comm_mysim,
417                  MPI_STATUS_IGNORE);
418 #endif
419     }
420 }
421
422
423 void gmx_pme_receive_f(gmx::PmePpCommGpu *pmePpCommGpu,
424                        const t_commrec *cr,
425                        gmx::ForceWithVirial *forceWithVirial,
426                        real *energy_q, real *energy_lj,
427                        real *dvdlambda_q, real *dvdlambda_lj,
428                        bool useGpuPmePpComms, bool receivePmeForceToGpu,
429                        float *pme_cycles)
430 {
431     if (c_useDelayedWait)
432     {
433         /* Wait for the x request to finish */
434         gmx_pme_send_coeffs_coords_wait(cr->dd);
435     }
436
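    /* Receive the PME forces into a buffer sized for this rank's home atoms. When the
     * forces are received directly into GPU memory (receivePmeForceToGpu), the
     * host-side reduction below is skipped. */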
437     const int               natoms = dd_numHomeAtoms(*cr->dd);
438     std::vector<gmx::RVec> &buffer = cr->dd->pmeForceReceiveBuffer;
439     buffer.resize(natoms);
440
441     void *recvptr = reinterpret_cast<void*>(buffer.data());
442     recvFFromPme(pmePpCommGpu, recvptr, natoms, cr, useGpuPmePpComms, receivePmeForceToGpu);
443
444     int nt = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, natoms);
445
446     gmx::ArrayRef<gmx::RVec> f = forceWithVirial->force_;
447
448     if (!receivePmeForceToGpu)
449     {
450         /* Note that we would like to avoid this conditional by putting it
451          * into the omp pragma instead, but then we still take the full
452          * omp parallel for overhead (at least with gcc5).
453          */
454         if (nt == 1)
455         {
456             for (int i = 0; i < natoms; i++)
457             {
458                 f[i] += buffer[i];
459             }
460         }
461         else
462         {
463 #pragma omp parallel for num_threads(nt) schedule(static)
464             for (int i = 0; i < natoms; i++)
465             {
466                 f[i] += buffer[i];
467             }
468         }
469     }
470
471     receive_virial_energy(cr, forceWithVirial, energy_q, energy_lj, dvdlambda_q, dvdlambda_lj, pme_cycles);
472 }