src/gromacs/ewald/pme_pp.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
   7  * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
   8  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   9  * and including many others, as listed in the AUTHORS file in the
  10  * top-level source directory and at http://www.gromacs.org.
  11  *
  12  * GROMACS is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public License
  14  * as published by the Free Software Foundation; either version 2.1
  15  * of the License, or (at your option) any later version.
  16  *
  17  * GROMACS is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with GROMACS; if not, see
  24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  26  *
  27  * If you want to redistribute modifications to GROMACS, please
  28  * consider that scientific software is very special. Version
  29  * control is crucial - bugs must be traceable. We will be happy to
  30  * consider code for inclusion in the official distribution, but
  31  * derived work must not be called official GROMACS. Details are found
  32  * in the README & COPYING files - if they are missing, get the
  33  * official version at http://www.gromacs.org.
  34  *
  35  * To help us fund GROMACS development, we humbly ask that you cite
  36  * the research papers on the package. Check out http://www.gromacs.org.
  37  */
  38 /*! \internal \file
  39  *
  40  * \brief This file contains function definitions necessary for
  41  * managing the offload of long-ranged PME work to separate MPI rank,
  42  * for computing energies and forces (Coulomb and LJ).
  43  *
  44  * \author Berk Hess <hess@kth.se>
  45  * \ingroup module_ewald
  46  */
  47
  48 #include "gmxpre.h"
  49
  50 #include "pme_pp.h"
  51
  52 #include "config.h"
  53
  54 #include <cstdio>
  55 #include <cstring>
  56
  57 #include "gromacs/domdec/domdec.h"
  58 #include "gromacs/domdec/domdec_struct.h"
  59 #include "gromacs/ewald/pme.h"
  60 #include "gromacs/ewald/pme_pp_comm_gpu.h"
  61 #include "gromacs/gmxlib/network.h"
  62 #include "gromacs/math/vec.h"
  63 #include "gromacs/mdlib/gmx_omp_nthreads.h"
  64 #include "gromacs/mdtypes/commrec.h"
  65 #include "gromacs/mdtypes/forceoutput.h"
  66 #include "gromacs/mdtypes/forcerec.h"
  67 #include "gromacs/mdtypes/interaction_const.h"
  68 #include "gromacs/mdtypes/md_enums.h"
  69 #include "gromacs/mdtypes/state_propagator_data_gpu.h"
  70 #include "gromacs/nbnxm/nbnxm.h"
  71 #include "gromacs/timing/wallcycle.h"
  72 #include "gromacs/utility/fatalerror.h"
  73 #include "gromacs/utility/gmxmpi.h"
  74 #include "gromacs/utility/smalloc.h"
  75
  76 #include "pme_pp_communication.h"
  77
  78 /*! \brief Block to wait for communication to PME ranks to complete
  79  *
  80  * This should be faster with a real non-blocking MPI implementation
  81  */
  82 static constexpr bool c_useDelayedWait = false;
  83
  84 /*! \brief Wait for the pending data send requests to PME ranks to complete */
  85 static void gmx_pme_send_coeffs_coords_wait(gmx_domdec_t* dd)
  86 {
  87     if (dd->nreq_pme)
  88     {
  89 #if GMX_MPI
  90         MPI_Waitall(dd->nreq_pme, dd->req_pme, MPI_STATUSES_IGNORE);
  91 #endif
  92         dd->nreq_pme = 0;
  93     }
  94 }
  95
  96 /*! \brief Send data to PME ranks */
  97 static void gmx_pme_send_coeffs_coords(t_forcerec*                    fr,
  98                                        const t_commrec*               cr,
  99                                        unsigned int                   flags,
 100                                        gmx::ArrayRef<const real>      chargeA,
 101                                        gmx::ArrayRef<const real>      chargeB,
 102                                        gmx::ArrayRef<const real>      c6A,
 103                                        gmx::ArrayRef<const real>      c6B,
 104                                        gmx::ArrayRef<const real>      sigmaA,
 105                                        gmx::ArrayRef<const real>      sigmaB,
 106                                        const matrix                   box,
 107                                        gmx::ArrayRef<const gmx::RVec> x,
 108                                        real                           lambda_q,
 109                                        real                           lambda_lj,
 110                                        int                            maxshift_x,
 111                                        int                            maxshift_y,
 112                                        int64_t                        step,
 113                                        bool                           useGpuPmePpComms,
 114                                        bool                           reinitGpuPmePpComms,
 115                                        bool                           sendCoordinatesFromGpu,
 116                                        bool                           receiveForcesToGpu,
 117                                        GpuEventSynchronizer*          coordinatesReadyOnDeviceEvent)
 118 {
 119     gmx_domdec_t*         dd;
 120     gmx_pme_comm_n_box_t* cnb;
 121     int                   n;
 122
 123     dd = cr->dd;
 124     n  = dd_numHomeAtoms(*dd);
 125
 126     if (debug)
 127     {
 128         fprintf(debug,
 129                 "PP rank %d sending to PME rank %d: %d%s%s%s%s\n",
 130                 cr->sim_nodeid,
 131                 dd->pme_nodeid,
 132                 n,
 133                 (flags & PP_PME_CHARGE) ? " charges" : "",
 134                 (flags & PP_PME_SQRTC6) ? " sqrtC6" : "",
 135                 (flags & PP_PME_SIGMA) ? " sigma" : "",
 136                 (flags & PP_PME_COORD) ? " coordinates" : "");
 137     }
 138
 139     if (useGpuPmePpComms)
 140     {
 141         flags |= PP_PME_GPUCOMMS;
 142         if (receiveForcesToGpu)
 143         {
 144             flags |= PP_PME_RECVFTOGPU;
 145         }
 146     }
 147
 148     if (c_useDelayedWait)
 149     {
 150         /* We can not use cnb until pending communication has finished */
 151         gmx_pme_send_coeffs_coords_wait(dd);
 152     }
 153
 154     if (dd->pme_receive_vir_ener)
 155     {
 156         /* Peer PP node: communicate all data */
 157         if (dd->cnb == nullptr)
 158         {
 159             snew(dd->cnb, 1);
 160         }
 161         cnb = dd->cnb;
 162
 163         cnb->flags      = flags;
 164         cnb->natoms     = n;
 165         cnb->maxshift_x = maxshift_x;
 166         cnb->maxshift_y = maxshift_y;
 167         cnb->lambda_q   = lambda_q;
 168         cnb->lambda_lj  = lambda_lj;
 169         cnb->step       = step;
 170         if (flags & PP_PME_COORD)
 171         {
 172             copy_mat(box, cnb->box);
 173         }
 174 #if GMX_MPI
 175         MPI_Isend(cnb,
 176                   sizeof(*cnb),
 177                   MPI_BYTE,
 178                   dd->pme_nodeid,
 179                   eCommType_CNB,
 180                   cr->mpi_comm_mysim,
 181                   &dd->req_pme[dd->nreq_pme++]);
 182 #endif
 183     }
 184     else if (flags & (PP_PME_CHARGE | PP_PME_SQRTC6 | PP_PME_SIGMA))
 185     {
 186 #if GMX_MPI
 187         /* Communicate only the number of atoms */
 188         MPI_Isend(&n,
 189                   sizeof(n),
 190                   MPI_BYTE,
 191                   dd->pme_nodeid,
 192                   eCommType_CNB,
 193                   cr->mpi_comm_mysim,
 194                   &dd->req_pme[dd->nreq_pme++]);
 195 #endif
 196     }
 197
 198 #if GMX_MPI
 199     if (n > 0)
 200     {
 201         if (flags & PP_PME_CHARGE)
 202         {
 203             MPI_Isend(chargeA.data(),
 204                       n * sizeof(real),
 205                       MPI_BYTE,
 206                       dd->pme_nodeid,
 207                       eCommType_ChargeA,
 208                       cr->mpi_comm_mysim,
 209                       &dd->req_pme[dd->nreq_pme++]);
 210         }
 211         if (flags & PP_PME_CHARGEB)
 212         {
 213             MPI_Isend(chargeB.data(),
 214                       n * sizeof(real),
 215                       MPI_BYTE,
 216                       dd->pme_nodeid,
 217                       eCommType_ChargeB,
 218                       cr->mpi_comm_mysim,
 219                       &dd->req_pme[dd->nreq_pme++]);
 220         }
 221         if (flags & PP_PME_SQRTC6)
 222         {
 223             MPI_Isend(c6A.data(),
 224                       n * sizeof(real),
 225                       MPI_BYTE,
 226                       dd->pme_nodeid,
 227                       eCommType_SQRTC6A,
 228                       cr->mpi_comm_mysim,
 229                       &dd->req_pme[dd->nreq_pme++]);
 230         }
 231         if (flags & PP_PME_SQRTC6B)
 232         {
 233             MPI_Isend(c6B.data(),
 234                       n * sizeof(real),
 235                       MPI_BYTE,
 236                       dd->pme_nodeid,
 237                       eCommType_SQRTC6B,
 238                       cr->mpi_comm_mysim,
 239                       &dd->req_pme[dd->nreq_pme++]);
 240         }
 241         if (flags & PP_PME_SIGMA)
 242         {
 243             MPI_Isend(sigmaA.data(),
 244                       n * sizeof(real),
 245                       MPI_BYTE,
 246                       dd->pme_nodeid,
 247                       eCommType_SigmaA,
 248                       cr->mpi_comm_mysim,
 249                       &dd->req_pme[dd->nreq_pme++]);
 250         }
 251         if (flags & PP_PME_SIGMAB)
 252         {
 253             MPI_Isend(sigmaB.data(),
 254                       n * sizeof(real),
 255                       MPI_BYTE,
 256                       dd->pme_nodeid,
 257                       eCommType_SigmaB,
 258                       cr->mpi_comm_mysim,
 259                       &dd->req_pme[dd->nreq_pme++]);
 260         }
 261         if (flags & PP_PME_COORD)
 262         {
 263             if (reinitGpuPmePpComms)
 264             {
 265                 std::vector<gmx::RVec>& buffer = cr->dd->pmeForceReceiveBuffer;
 266                 buffer.resize(n);
 267                 fr->pmePpCommGpu->reinit(n);
 268             }
 269
 270             if (useGpuPmePpComms && (fr != nullptr))
 271             {
 272                 if (sendCoordinatesFromGpu)
 273                 {
 274                     fr->pmePpCommGpu->sendCoordinatesToPmeFromGpu(
 275                             fr->stateGpu->getCoordinates(), n, coordinatesReadyOnDeviceEvent);
 276                 }
 277                 else
 278                 {
 279                     fr->pmePpCommGpu->sendCoordinatesToPmeFromCpu(
 280                             const_cast<gmx::RVec*>(x.data()), n, coordinatesReadyOnDeviceEvent);
 281                 }
 282             }
 283             else
 284             {
 285                 MPI_Isend(x.data(),
 286                           n * sizeof(rvec),
 287                           MPI_BYTE,
 288                           dd->pme_nodeid,
 289                           eCommType_COORD,
 290                           cr->mpi_comm_mysim,
 291                           &dd->req_pme[dd->nreq_pme++]);
 292             }
 293         }
 294     }
 295 #else
 296     GMX_UNUSED_VALUE(fr);
 297     GMX_UNUSED_VALUE(chargeA);
 298     GMX_UNUSED_VALUE(chargeB);
 299     GMX_UNUSED_VALUE(c6A);
 300     GMX_UNUSED_VALUE(c6B);
 301     GMX_UNUSED_VALUE(sigmaA);
 302     GMX_UNUSED_VALUE(sigmaB);
 303     GMX_UNUSED_VALUE(x);
 304     GMX_UNUSED_VALUE(reinitGpuPmePpComms);
 305     GMX_UNUSED_VALUE(sendCoordinatesFromGpu);
 306     GMX_UNUSED_VALUE(coordinatesReadyOnDeviceEvent);
 307 #endif
 308     if (!c_useDelayedWait)
 309     {
 310         /* Wait for the data to arrive */
 311         /* We can skip this wait as we are sure x and q will not be modified
 312          * before the next call to gmx_pme_send_x_q or gmx_pme_receive_f.
 313          */
 314         gmx_pme_send_coeffs_coords_wait(dd);
 315     }
 316 }
 317
 318 void gmx_pme_send_parameters(const t_commrec*           cr,
 319                              const interaction_const_t& interactionConst,
 320                              bool                       bFreeEnergy_q,
 321                              bool                       bFreeEnergy_lj,
 322                              gmx::ArrayRef<const real>  chargeA,
 323                              gmx::ArrayRef<const real>  chargeB,
 324                              gmx::ArrayRef<const real>  sqrt_c6A,
 325                              gmx::ArrayRef<const real>  sqrt_c6B,
 326                              gmx::ArrayRef<const real>  sigmaA,
 327                              gmx::ArrayRef<const real>  sigmaB,
 328                              int                        maxshift_x,
 329                              int                        maxshift_y)
 330 {
 331     unsigned int flags = 0;
 332
 333     if (EEL_PME(interactionConst.eeltype))
 334     {
 335         flags |= PP_PME_CHARGE;
 336     }
 337     if (EVDW_PME(interactionConst.vdwtype))
 338     {
 339         flags |= (PP_PME_SQRTC6 | PP_PME_SIGMA);
 340     }
 341     if (bFreeEnergy_q || bFreeEnergy_lj)
 342     {
 343         /* Assumes that the B state flags are in the bits just above
 344          * the ones for the A state. */
 345         flags |= (flags << 1);
 346     }
 347
 348     gmx_pme_send_coeffs_coords(nullptr,
 349                                cr,
 350                                flags,
 351                                chargeA,
 352                                chargeB,
 353                                sqrt_c6A,
 354                                sqrt_c6B,
 355                                sigmaA,
 356                                sigmaB,
 357                                nullptr,
 358                                gmx::ArrayRef<gmx::RVec>(),
 359                                0,
 360                                0,
 361                                maxshift_x,
 362                                maxshift_y,
 363                                -1,
 364                                false,
 365                                false,
 366                                false,
 367                                false,
 368                                nullptr);
 369 }
 370
 371 void gmx_pme_send_coordinates(t_forcerec*                    fr,
 372                               const t_commrec*               cr,
 373                               const matrix                   box,
 374                               gmx::ArrayRef<const gmx::RVec> x,
 375                               real                           lambda_q,
 376                               real                           lambda_lj,
 377                               bool                           computeEnergyAndVirial,
 378                               int64_t                        step,
 379                               bool                           useGpuPmePpComms,
 380                               bool                           receiveCoordinateAddressFromPme,
 381                               bool                           sendCoordinatesFromGpu,
 382                               bool                           receiveForcesToGpu,
 383                               GpuEventSynchronizer*          coordinatesReadyOnDeviceEvent,
 384                               gmx_wallcycle*                 wcycle)
 385 {
 386     wallcycle_start(wcycle, WallCycleCounter::PpPmeSendX);
 387
 388     unsigned int flags = PP_PME_COORD;
 389     if (computeEnergyAndVirial)
 390     {
 391         flags |= PP_PME_ENER_VIR;
 392     }
 393     gmx_pme_send_coeffs_coords(fr,
 394                                cr,
 395                                flags,
 396                                {},
 397                                {},
 398                                {},
 399                                {},
 400                                {},
 401                                {},
 402                                box,
 403                                x,
 404                                lambda_q,
 405                                lambda_lj,
 406                                0,
 407                                0,
 408                                step,
 409                                useGpuPmePpComms,
 410                                receiveCoordinateAddressFromPme,
 411                                sendCoordinatesFromGpu,
 412                                receiveForcesToGpu,
 413                                coordinatesReadyOnDeviceEvent);
 414
 415     wallcycle_stop(wcycle, WallCycleCounter::PpPmeSendX);
 416 }
 417
 418 void gmx_pme_send_finish(const t_commrec* cr)
 419 {
 420     unsigned int flags = PP_PME_FINISH;
 421
 422     gmx_pme_send_coeffs_coords(
 423             nullptr, cr, flags, {}, {}, {}, {}, {}, {}, nullptr, gmx::ArrayRef<gmx::RVec>(), 0, 0, 0, 0, -1, false, false, false, false, nullptr);
 424 }
 425
 426 void gmx_pme_send_switchgrid(const t_commrec* cr, ivec grid_size, real ewaldcoeff_q, real ewaldcoeff_lj)
 427 {
 428 #if GMX_MPI
 429     gmx_pme_comm_n_box_t cnb;
 430
 431     /* Only let one PP node signal each PME node */
 432     if (cr->dd->pme_receive_vir_ener)
 433     {
 434         cnb.flags = PP_PME_SWITCHGRID;
 435         copy_ivec(grid_size, cnb.grid_size);
 436         cnb.ewaldcoeff_q  = ewaldcoeff_q;
 437         cnb.ewaldcoeff_lj = ewaldcoeff_lj;
 438
 439         /* We send this, uncommon, message blocking to simplify the code */
 440         MPI_Send(&cnb, sizeof(cnb), MPI_BYTE, cr->dd->pme_nodeid, eCommType_CNB, cr->mpi_comm_mysim);
 441     }
 442 #else
 443     GMX_UNUSED_VALUE(cr);
 444     GMX_UNUSED_VALUE(grid_size);
 445     GMX_UNUSED_VALUE(ewaldcoeff_q);
 446     GMX_UNUSED_VALUE(ewaldcoeff_lj);
 447 #endif
 448 }
 449
 450 void gmx_pme_send_resetcounters(const t_commrec gmx_unused* cr, int64_t gmx_unused step)
 451 {
 452 #if GMX_MPI
 453     gmx_pme_comm_n_box_t cnb;
 454
 455     /* Only let one PP node signal each PME node */
 456     if (cr->dd->pme_receive_vir_ener)
 457     {
 458         cnb.flags = PP_PME_RESETCOUNTERS;
 459         cnb.step  = step;
 460
 461         /* We send this, uncommon, message blocking to simplify the code */
 462         MPI_Send(&cnb, sizeof(cnb), MPI_BYTE, cr->dd->pme_nodeid, eCommType_CNB, cr->mpi_comm_mysim);
 463     }
 464 #endif
 465 }
 466
 467 /*! \brief Receive virial and energy from PME rank */
 468 static void receive_virial_energy(const t_commrec*      cr,
 469                                   gmx::ForceWithVirial* forceWithVirial,
 470                                   real*                 energy_q,
 471                                   real*                 energy_lj,
 472                                   real*                 dvdlambda_q,
 473                                   real*                 dvdlambda_lj,
 474                                   float*                pme_cycles)
 475 {
 476     gmx_pme_comm_vir_ene_t cve;
 477
 478     if (cr->dd->pme_receive_vir_ener)
 479     {
 480         if (debug)
 481         {
 482             fprintf(debug,
 483                     "PP rank %d receiving from PME rank %d: virial and energy\n",
 484                     cr->sim_nodeid,
 485                     cr->dd->pme_nodeid);
 486         }
 487 #if GMX_MPI
 488         MPI_Recv(&cve, sizeof(cve), MPI_BYTE, cr->dd->pme_nodeid, 1, cr->mpi_comm_mysim, MPI_STATUS_IGNORE);
 489 #else
 490         memset(&cve, 0, sizeof(cve));
 491 #endif
 492
 493         forceWithVirial->addVirialContribution(cve.vir_q);
 494         forceWithVirial->addVirialContribution(cve.vir_lj);
 495         *energy_q  = cve.energy_q;
 496         *energy_lj = cve.energy_lj;
 497         *dvdlambda_q += cve.dvdlambda_q;
 498         *dvdlambda_lj += cve.dvdlambda_lj;
 499         *pme_cycles = cve.cycles;
 500
 501         if (cve.stop_cond != StopCondition::None)
 502         {
 503             gmx_set_stop_condition(cve.stop_cond);
 504         }
 505     }
 506     else
 507     {
 508         *energy_q   = 0;
 509         *energy_lj  = 0;
 510         *pme_cycles = 0;
 511     }
 512 }
 513
 514 /*! \brief Recieve force data from PME ranks */
 515 static void recvFFromPme(gmx::PmePpCommGpu* pmePpCommGpu,
 516                          void*              recvptr,
 517                          int                n,
 518                          const t_commrec*   cr,
 519                          bool               useGpuPmePpComms,
 520                          bool               receivePmeForceToGpu)
 521 {
 522     if (useGpuPmePpComms)
 523     {
 524         GMX_ASSERT(pmePpCommGpu != nullptr, "Need valid pmePpCommGpu");
 525         // Receive forces from PME rank
 526         pmePpCommGpu->receiveForceFromPme(static_cast<gmx::RVec*>(recvptr), n, receivePmeForceToGpu);
 527     }
 528     else
 529     {
 530         // Receive data using MPI
 531 #if GMX_MPI
 532         MPI_Recv(recvptr, n * sizeof(rvec), MPI_BYTE, cr->dd->pme_nodeid, 0, cr->mpi_comm_mysim, MPI_STATUS_IGNORE);
 533 #else
 534         GMX_UNUSED_VALUE(cr);
 535         GMX_UNUSED_VALUE(n);
 536 #endif
 537     }
 538 }
 539
 540
 541 void gmx_pme_receive_f(gmx::PmePpCommGpu*    pmePpCommGpu,
 542                        const t_commrec*      cr,
 543                        gmx::ForceWithVirial* forceWithVirial,
 544                        real*                 energy_q,
 545                        real*                 energy_lj,
 546                        real*                 dvdlambda_q,
 547                        real*                 dvdlambda_lj,
 548                        bool                  useGpuPmePpComms,
 549                        bool                  receivePmeForceToGpu,
 550                        float*                pme_cycles)
 551 {
 552     if (c_useDelayedWait)
 553     {
 554         /* Wait for the x request to finish */
 555         gmx_pme_send_coeffs_coords_wait(cr->dd);
 556     }
 557
 558     const int               natoms = dd_numHomeAtoms(*cr->dd);
 559     std::vector<gmx::RVec>& buffer = cr->dd->pmeForceReceiveBuffer;
 560     buffer.resize(natoms);
 561
 562     void* recvptr = reinterpret_cast<void*>(buffer.data());
 563     recvFFromPme(pmePpCommGpu, recvptr, natoms, cr, useGpuPmePpComms, receivePmeForceToGpu);
 564
 565     int nt = gmx_omp_nthreads_get_simple_rvec_task(ModuleMultiThread::Default, natoms);
 566
 567     gmx::ArrayRef<gmx::RVec> f = forceWithVirial->force_;
 568
 569     if (!receivePmeForceToGpu)
 570     {
 571         /* Note that we would like to avoid this conditional by putting it
 572          * into the omp pragma instead, but then we still take the full
 573          * omp parallel for overhead (at least with gcc5).
 574          */
 575         if (nt == 1)
 576         {
 577             for (int i = 0; i < natoms; i++)
 578             {
 579                 f[i] += buffer[i];
 580             }
 581         }
 582         else
 583         {
 584 #pragma omp parallel for num_threads(nt) schedule(static)
 585             for (int i = 0; i < natoms; i++)
 586             {
 587                 f[i] += buffer[i];
 588             }
 589         }
 590     }
 591
 592     receive_virial_energy(cr, forceWithVirial, energy_q, energy_lj, dvdlambda_q, dvdlambda_lj, pme_cycles);
 593 }