src/gromacs/ewald/pme_gpu.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2016,2017,2018,2019,2020,2021, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 /*! \internal \file
  37  * \brief Implements high-level PME GPU functions which do not require GPU framework-specific code.
  38  *
  39  * \author Aleksei Iupinov <a.yupinov@gmail.com>
  40  * \ingroup module_ewald
  41  */
  42
  43 #include "gmxpre.h"
  44
  45 #include "config.h"
  46
  47 #include <list>
  48
  49 #include "gromacs/ewald/ewald_utils.h"
  50 #include "gromacs/ewald/pme.h"
  51 #include "gromacs/fft/parallel_3dfft.h"
  52 #include "gromacs/math/invertmatrix.h"
  53 #include "gromacs/mdlib/gmx_omp_nthreads.h"
  54 #include "gromacs/mdtypes/enerdata.h"
  55 #include "gromacs/mdtypes/forceoutput.h"
  56 #include "gromacs/mdtypes/inputrec.h"
  57 #include "gromacs/mdtypes/simulation_workload.h"
  58 #include "gromacs/utility/exceptions.h"
  59 #include "gromacs/utility/fatalerror.h"
  60 #include "gromacs/utility/gmxassert.h"
  61 #include "gromacs/utility/stringutil.h"
  62
  63 #include "pme_gpu_internal.h"
  64 #include "pme_gpu_settings.h"
  65 #include "pme_gpu_timings.h"
  66 #include "pme_gpu_types_host.h"
  67 #include "pme_grid.h"
  68 #include "pme_internal.h"
  69 #include "pme_solve.h"
  70
  71 /*! \brief
  72  * Finds out if PME is currently running on GPU.
  73  *
  74  * \todo The GPU module should not be constructed (or at least called)
  75  * when it is not active, so there should be no need to check whether
  76  * it is active. An assertion that this is true makes sense.
  77  *
  78  * \param[in] pme  The PME structure.
  79  * \returns        True if PME runs on GPU currently, false otherwise.
  80  */
  81 static inline bool pme_gpu_active(const gmx_pme_t* pme)
  82 {
  83     return (pme != nullptr) && (pme->runMode != PmeRunMode::CPU);
  84 }
  85
  86 void pme_gpu_reset_timings(const gmx_pme_t* pme)
  87 {
  88     if (pme_gpu_active(pme))
  89     {
  90         pme_gpu_reset_timings(pme->gpu);
  91     }
  92 }
  93
  94 void pme_gpu_get_timings(const gmx_pme_t* pme, gmx_wallclock_gpu_pme_t* timings)
  95 {
  96     if (pme_gpu_active(pme))
  97     {
  98         pme_gpu_get_timings(pme->gpu, timings);
  99     }
 100 }
 101
 102 int pme_gpu_get_block_size(const gmx_pme_t* pme)
 103 {
 104
 105     if (!pme || !pme_gpu_active(pme))
 106     {
 107         return 0;
 108     }
 109     else
 110     {
 111         return pme_gpu_get_atom_data_block_size();
 112     }
 113 }
 114
 115 /*! \brief
 116  * A convenience wrapper for launching either the GPU or CPU FFT.
 117  *
 118  * \param[in] pme            The PME structure.
 119  * \param[in] gridIndex      The grid index - should currently always be 0.
 120  * \param[in] dir            The FFT direction enum.
 121  * \param[in] wcycle         The wallclock counter.
 122  */
 123 void inline parallel_3dfft_execute_gpu_wrapper(gmx_pme_t*             pme,
 124                                                const int              gridIndex,
 125                                                enum gmx_fft_direction dir,
 126                                                gmx_wallcycle*         wcycle)
 127 {
 128     if (pme_gpu_settings(pme->gpu).performGPUFFT)
 129     {
 130         wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
 131         wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuPme);
 132         pme_gpu_3dfft(pme->gpu, dir, gridIndex);
 133         wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuPme);
 134         wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
 135     }
 136     else
 137     {
 138         wallcycle_start(wcycle, WallCycleCounter::PmeFftMixedMode);
 139 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
 140         for (int thread = 0; thread < pme->nthread; thread++)
 141         {
 142             gmx_parallel_3dfft_execute(pme->pfft_setup[gridIndex], dir, thread, wcycle);
 143         }
 144         wallcycle_stop(wcycle, WallCycleCounter::PmeFftMixedMode);
 145     }
 146 }
 147
 148 /* The PME computation code split into a few separate functions. */
 149
 150 void pme_gpu_prepare_computation(gmx_pme_t*               pme,
 151                                  const matrix             box,
 152                                  gmx_wallcycle*           wcycle,
 153                                  const gmx::StepWorkload& stepWork)
 154 {
 155     GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
 156     GMX_ASSERT(pme->nnodes > 0, "");
 157     GMX_ASSERT(pme->nnodes == 1 || pme->ndecompdim > 0, "");
 158
 159     PmeGpu* pmeGpu = pme->gpu;
 160     // TODO these flags are only here to honor the CPU PME code, and probably should be removed
 161     pmeGpu->settings.useGpuForceReduction = stepWork.useGpuPmeFReduction;
 162
 163     bool shouldUpdateBox = false;
 164     for (int i = 0; i < DIM; ++i)
 165     {
 166         for (int j = 0; j <= i; ++j)
 167         {
 168             shouldUpdateBox |= (pmeGpu->common->previousBox[i][j] != box[i][j]);
 169             pmeGpu->common->previousBox[i][j] = box[i][j];
 170         }
 171     }
 172
 173     if (stepWork.haveDynamicBox || shouldUpdateBox) // || is to make the first computation always update
 174     {
 175         wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
 176         wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuPme);
 177         pme_gpu_update_input_box(pmeGpu, box);
 178         wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuPme);
 179         wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
 180
 181         if (!pme_gpu_settings(pmeGpu).performGPUSolve)
 182         {
 183             // TODO remove code duplication and add test coverage
 184             matrix scaledBox;
 185             pmeGpu->common->boxScaler->scaleBox(box, scaledBox);
 186             gmx::invertBoxMatrix(scaledBox, pme->recipbox);
 187             pme->boxVolume = scaledBox[XX][XX] * scaledBox[YY][YY] * scaledBox[ZZ][ZZ];
 188         }
 189     }
 190 }
 191
 192 void pme_gpu_launch_spread(gmx_pme_t*            pme,
 193                            GpuEventSynchronizer* xReadyOnDevice,
 194                            gmx_wallcycle*        wcycle,
 195                            const real            lambdaQ)
 196 {
 197     GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
 198     GMX_ASSERT(!GMX_GPU_CUDA || xReadyOnDevice || !pme->bPPnode,
 199                "Need a valid xReadyOnDevice on PP+PME ranks with CUDA.");
 200     GMX_ASSERT(pme->doCoulomb, "Only Coulomb PME can be run on GPU.");
 201
 202     PmeGpu* pmeGpu = pme->gpu;
 203
 204     GMX_ASSERT(pmeGpu->common->ngrids == 1 || (pmeGpu->common->ngrids == 2 && pme->bFEP_q),
 205                "If not decoupling Coulomb interactions there should only be one FEP grid. If "
 206                "decoupling Coulomb interactions there should be two grids.");
 207
 208     /* PME on GPU can currently manage two grids:
 209      * grid_index=0: Coulomb PME with charges in the normal state or from FEP state A.
 210      * grid_index=1: Coulomb PME with charges from FEP state B.
 211      */
 212     real** fftgrids = pme->fftgrid;
 213     /* Spread the coefficients on a grid */
 214     const bool computeSplines = true;
 215     const bool spreadCharges  = true;
 216     wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
 217     wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuPme);
 218     pme_gpu_spread(pmeGpu, xReadyOnDevice, fftgrids, computeSplines, spreadCharges, lambdaQ);
 219     wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuPme);
 220     wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
 221 }
 222
 223 void pme_gpu_launch_complex_transforms(gmx_pme_t* pme, gmx_wallcycle* wcycle, const gmx::StepWorkload& stepWork)
 224 {
 225     PmeGpu*     pmeGpu   = pme->gpu;
 226     const auto& settings = pmeGpu->settings;
 227     // There's no support for computing energy without virial, or vice versa
 228     const bool computeEnergyAndVirial = stepWork.computeEnergy || stepWork.computeVirial;
 229     if (!settings.performGPUFFT)
 230     {
 231         wallcycle_start(wcycle, WallCycleCounter::WaitGpuPmeSpread);
 232         pme_gpu_sync_spread_grid(pme->gpu);
 233         wallcycle_stop(wcycle, WallCycleCounter::WaitGpuPmeSpread);
 234     }
 235
 236     try
 237     {
 238         /* The 3dffts and the solve are done in a loop to simplify things, even if this means that
 239          * there will be two kernel launches for solve. */
 240         for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++)
 241         {
 242             /* do R2C 3D-FFT */
 243             t_complex* cfftgrid = pme->cfftgrid[gridIndex];
 244             parallel_3dfft_execute_gpu_wrapper(pme, gridIndex, GMX_FFT_REAL_TO_COMPLEX, wcycle);
 245
 246             /* solve in k-space for our local cells */
 247             if (settings.performGPUSolve)
 248             {
 249                 const auto gridOrdering =
 250                         settings.useDecomposition ? GridOrdering::YZX : GridOrdering::XYZ;
 251                 wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
 252                 wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuPme);
 253                 pme_gpu_solve(pmeGpu, gridIndex, cfftgrid, gridOrdering, computeEnergyAndVirial);
 254                 wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuPme);
 255                 wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
 256             }
 257             else
 258             {
 259                 wallcycle_start(wcycle, WallCycleCounter::PmeSolveMixedMode);
 260 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
 261                 for (int thread = 0; thread < pme->nthread; thread++)
 262                 {
 263                     solve_pme_yzx(pme, cfftgrid, pme->boxVolume, computeEnergyAndVirial, pme->nthread, thread);
 264                 }
 265                 wallcycle_stop(wcycle, WallCycleCounter::PmeSolveMixedMode);
 266             }
 267
 268             parallel_3dfft_execute_gpu_wrapper(pme, gridIndex, GMX_FFT_COMPLEX_TO_REAL, wcycle);
 269         }
 270     }
 271     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
 272 }
 273
 274 void pme_gpu_launch_gather(const gmx_pme_t* pme, gmx_wallcycle gmx_unused* wcycle, const real lambdaQ)
 275 {
 276     GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
 277
 278     if (!pme_gpu_settings(pme->gpu).performGPUGather)
 279     {
 280         return;
 281     }
 282
 283     wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
 284     wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuPme);
 285
 286     float** fftgrids = pme->fftgrid;
 287     pme_gpu_gather(pme->gpu, fftgrids, lambdaQ);
 288     wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuPme);
 289     wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
 290 }
 291
 292 //! Accumulate the \c forcesToAdd to \c f, using the available threads.
 293 static void sum_forces(gmx::ArrayRef<gmx::RVec> f, gmx::ArrayRef<const gmx::RVec> forceToAdd)
 294 {
 295     const int end = forceToAdd.size();
 296
 297     int gmx_unused nt = gmx_omp_nthreads_get(ModuleMultiThread::Pme);
 298 #pragma omp parallel for num_threads(nt) schedule(static)
 299     for (int i = 0; i < end; i++)
 300     {
 301         f[i] += forceToAdd[i];
 302     }
 303 }
 304
 305 //! Reduce quantities from \c output to \c forceWithVirial and \c enerd.
 306 static void pme_gpu_reduce_outputs(const bool            computeEnergyAndVirial,
 307                                    const PmeOutput&      output,
 308                                    gmx_wallcycle*        wcycle,
 309                                    gmx::ForceWithVirial* forceWithVirial,
 310                                    gmx_enerdata_t*       enerd)
 311 {
 312     wallcycle_start(wcycle, WallCycleCounter::PmeGpuFReduction);
 313     GMX_ASSERT(forceWithVirial, "Invalid force pointer");
 314
 315     if (computeEnergyAndVirial)
 316     {
 317         GMX_ASSERT(enerd, "Invalid energy output manager");
 318         forceWithVirial->addVirialContribution(output.coulombVirial_);
 319         enerd->term[F_COUL_RECIP] += output.coulombEnergy_;
 320         enerd->dvdl_lin[FreeEnergyPerturbationCouplingType::Coul] += output.coulombDvdl_;
 321     }
 322     if (output.haveForceOutput_)
 323     {
 324         sum_forces(forceWithVirial->force_, output.forces_);
 325     }
 326     wallcycle_stop(wcycle, WallCycleCounter::PmeGpuFReduction);
 327 }
 328
 329 bool pme_gpu_try_finish_task(gmx_pme_t*               pme,
 330                              const gmx::StepWorkload& stepWork,
 331                              gmx_wallcycle*           wcycle,
 332                              gmx::ForceWithVirial*    forceWithVirial,
 333                              gmx_enerdata_t*          enerd,
 334                              const real               lambdaQ,
 335                              GpuTaskCompletion        completionKind)
 336 {
 337     GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
 338     GMX_ASSERT(!pme->gpu->settings.useGpuForceReduction,
 339                "GPU force reduction should not be active on the pme_gpu_try_finish_task() path");
 340
 341     // First, if possible, check whether all tasks on the stream have
 342     // completed, and return fast if not. Accumulate to wcycle the
 343     // time needed for that checking, but do not yet record that the
 344     // gather has occured.
 345     bool           needToSynchronize      = true;
 346     constexpr bool c_streamQuerySupported = GMX_GPU_CUDA;
 347
 348     // TODO: implement c_streamQuerySupported with an additional GpuEventSynchronizer per stream (#2521)
 349     if ((completionKind == GpuTaskCompletion::Check) && c_streamQuerySupported)
 350     {
 351         wallcycle_start_nocount(wcycle, WallCycleCounter::WaitGpuPmeGather);
 352         // Query the PME stream for completion of all tasks enqueued and
 353         // if we're not done, stop the timer before early return.
 354         const bool pmeGpuDone = pme_gpu_stream_query(pme->gpu);
 355         wallcycle_stop(wcycle, WallCycleCounter::WaitGpuPmeGather);
 356
 357         if (!pmeGpuDone)
 358         {
 359             return false;
 360         }
 361         needToSynchronize = false;
 362     }
 363
 364     wallcycle_start(wcycle, WallCycleCounter::WaitGpuPmeGather);
 365     // If the above check passed, then there is no need to make an
 366     // explicit synchronization call.
 367     if (needToSynchronize)
 368     {
 369         // Synchronize the whole PME stream at once, including D2H result transfers.
 370         pme_gpu_synchronize(pme->gpu);
 371     }
 372     pme_gpu_update_timings(pme->gpu);
 373     // There's no support for computing energy without virial, or vice versa
 374     const bool computeEnergyAndVirial = stepWork.computeEnergy || stepWork.computeVirial;
 375     PmeOutput  output                 = pme_gpu_getOutput(
 376             *pme, computeEnergyAndVirial, pme->gpu->common->ngrids > 1 ? lambdaQ : 1.0);
 377     wallcycle_stop(wcycle, WallCycleCounter::WaitGpuPmeGather);
 378
 379     GMX_ASSERT(pme->gpu->settings.useGpuForceReduction == !output.haveForceOutput_,
 380                "When forces are reduced on the CPU, there needs to be force output");
 381     pme_gpu_reduce_outputs(computeEnergyAndVirial, output, wcycle, forceWithVirial, enerd);
 382
 383     return true;
 384 }
 385
 386 // This is used by PME-only ranks
 387 PmeOutput pme_gpu_wait_finish_task(gmx_pme_t*     pme,
 388                                    const bool     computeEnergyAndVirial,
 389                                    const real     lambdaQ,
 390                                    gmx_wallcycle* wcycle)
 391 {
 392     GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
 393
 394     wallcycle_start(wcycle, WallCycleCounter::WaitGpuPmeGather);
 395
 396     // Synchronize the whole PME stream at once, including D2H result transfers
 397     // if there are outputs we need to wait for at this step; we still call getOutputs
 398     // for uniformity and because it sets the PmeOutput.haveForceOutput_.
 399     if (!pme->gpu->settings.useGpuForceReduction || computeEnergyAndVirial)
 400     {
 401         pme_gpu_synchronize(pme->gpu);
 402     }
 403
 404     PmeOutput output = pme_gpu_getOutput(
 405             *pme, computeEnergyAndVirial, pme->gpu->common->ngrids > 1 ? lambdaQ : 1.0);
 406     wallcycle_stop(wcycle, WallCycleCounter::WaitGpuPmeGather);
 407     return output;
 408 }
 409
 410 // This is used when not using the alternate-waiting reduction
 411 void pme_gpu_wait_and_reduce(gmx_pme_t*               pme,
 412                              const gmx::StepWorkload& stepWork,
 413                              gmx_wallcycle*           wcycle,
 414                              gmx::ForceWithVirial*    forceWithVirial,
 415                              gmx_enerdata_t*          enerd,
 416                              const real               lambdaQ)
 417 {
 418     // There's no support for computing energy without virial, or vice versa
 419     const bool computeEnergyAndVirial = stepWork.computeEnergy || stepWork.computeVirial;
 420     PmeOutput  output                 = pme_gpu_wait_finish_task(
 421             pme, computeEnergyAndVirial, pme->gpu->common->ngrids > 1 ? lambdaQ : 1.0, wcycle);
 422     GMX_ASSERT(pme->gpu->settings.useGpuForceReduction == !output.haveForceOutput_,
 423                "When forces are reduced on the CPU, there needs to be force output");
 424     pme_gpu_reduce_outputs(computeEnergyAndVirial, output, wcycle, forceWithVirial, enerd);
 425 }
 426
 427 void pme_gpu_reinit_computation(const gmx_pme_t* pme, gmx_wallcycle* wcycle)
 428 {
 429     GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
 430
 431     wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
 432     wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuPme);
 433
 434     pme_gpu_update_timings(pme->gpu);
 435
 436     pme_gpu_clear_grids(pme->gpu);
 437     pme_gpu_clear_energy_virial(pme->gpu);
 438
 439     wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuPme);
 440     wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
 441 }
 442
 443 DeviceBuffer<gmx::RVec> pme_gpu_get_device_f(const gmx_pme_t* pme)
 444 {
 445     if (!pme || !pme_gpu_active(pme))
 446     {
 447         return DeviceBuffer<gmx::RVec>{};
 448     }
 449     return pme_gpu_get_kernelparam_forces(pme->gpu);
 450 }
 451
 452 void pme_gpu_set_device_x(const gmx_pme_t* pme, DeviceBuffer<gmx::RVec> d_x)
 453 {
 454     GMX_ASSERT(pme != nullptr, "Null pointer is passed as a PME to the set coordinates function.");
 455     GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
 456
 457     pme_gpu_set_kernelparam_coordinates(pme->gpu, d_x);
 458 }
 459
 460 GpuEventSynchronizer* pme_gpu_get_f_ready_synchronizer(const gmx_pme_t* pme)
 461 {
 462     if (!pme || !pme_gpu_active(pme))
 463     {
 464         return nullptr;
 465     }
 466
 467     return pme_gpu_get_forces_ready_synchronizer(pme->gpu);
 468 }