src/gromacs/ewald/pme-gpu.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2016,2017, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 /*! \internal \file
  37  * \brief Implements high-level PME GPU functions which do not require GPU framework-specific code.
  38  *
  39  * \author Aleksei Iupinov <a.yupinov@gmail.com>
  40  * \ingroup module_ewald
  41  */
  42
  43 #include "gmxpre.h"
  44
  45 #include "config.h"
  46
  47 #include <list>
  48
  49 #include "gromacs/ewald/pme.h"
  50 #include "gromacs/fft/parallel_3dfft.h"
  51 #include "gromacs/math/invertmatrix.h"
  52 #include "gromacs/mdtypes/inputrec.h"
  53 #include "gromacs/utility/exceptions.h"
  54 #include "gromacs/utility/fatalerror.h"
  55 #include "gromacs/utility/gmxassert.h"
  56 #include "gromacs/utility/stringutil.h"
  57
  58 #include "pme-gpu-internal.h"
  59 #include "pme-grid.h"
  60 #include "pme-internal.h"
  61 #include "pme-solve.h"
  62
  63 PmeRunMode pme_run_mode(const gmx_pme_t *pme)
  64 {
  65     GMX_ASSERT(pme != nullptr, "Expecting valid PME data pointer");
  66     return pme->runMode;
  67 }
  68
  69 bool pme_gpu_supports_input(const t_inputrec *ir, std::string *error)
  70 {
  71     std::list<std::string> errorReasons;
  72     if (!EEL_PME(ir->coulombtype))
  73     {
  74         errorReasons.push_back("systems that do not use PME for electrostatics");
  75     }
  76     if (ir->pme_order != 4)
  77     {
  78         errorReasons.push_back("interpolation orders other than 4");
  79     }
  80     if (ir->efep != efepNO)
  81     {
  82         errorReasons.push_back("free energy calculations (multiple grids)");
  83     }
  84     if (EVDW_PME(ir->vdwtype))
  85     {
  86         errorReasons.push_back("Lennard-Jones PME");
  87     }
  88 #if GMX_DOUBLE
  89     {
  90         errorReasons.push_back("double precision");
  91     }
  92 #endif
  93 #if GMX_GPU != GMX_GPU_CUDA
  94     {
  95         errorReasons.push_back("non-CUDA build of GROMACS");
  96     }
  97 #endif
  98     if (ir->cutoff_scheme == ecutsGROUP)
  99     {
 100         errorReasons.push_back("group cutoff scheme");
 101     }
 102     if (EI_TPI(ir->eI))
 103     {
 104         errorReasons.push_back("test particle insertion");
 105     }
 106
 107     bool inputSupported = errorReasons.empty();
 108     if (!inputSupported && error)
 109     {
 110         std::string regressionTestMarker = "PME GPU does not support";
 111         // this prefix is tested for in the regression tests script gmxtest.pl
 112         *error = regressionTestMarker + ": " + gmx::joinStrings(errorReasons, "; ") + ".";
 113     }
 114     return inputSupported;
 115 }
 116
 117 void pme_gpu_reset_timings(const gmx_pme_t *pme)
 118 {
 119     if (pme_gpu_active(pme))
 120     {
 121         pme_gpu_reset_timings(pme->gpu);
 122     }
 123 }
 124
 125 void pme_gpu_get_timings(const gmx_pme_t *pme, gmx_wallclock_gpu_pme_t *timings)
 126 {
 127     if (pme_gpu_active(pme))
 128     {
 129         pme_gpu_get_timings(pme->gpu, timings);
 130     }
 131 }
 132
 133 /*! \brief
 134  * A convenience wrapper for launching either the GPU or CPU FFT.
 135  *
 136  * \param[in] pme            The PME structure.
 137  * \param[in] gridIndex      The grid index - should currently always be 0.
 138  * \param[in] dir            The FFT direction enum.
 139  * \param[in] wcycle         The wallclock counter.
 140  */
 141 void inline parallel_3dfft_execute_gpu_wrapper(gmx_pme_t              *pme,
 142                                                const int               gridIndex,
 143                                                enum gmx_fft_direction  dir,
 144                                                gmx_wallcycle_t         wcycle)
 145 {
 146     GMX_ASSERT(gridIndex == 0, "Only single grid supported");
 147     if (pme_gpu_performs_FFT(pme->gpu))
 148     {
 149         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
 150         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
 151         pme_gpu_3dfft(pme->gpu, dir, gridIndex);
 152         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
 153         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
 154     }
 155     else
 156     {
 157         wallcycle_start(wcycle, ewcPME_FFT_MIXED_MODE);
 158 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
 159         for (int thread = 0; thread < pme->nthread; thread++)
 160         {
 161             gmx_parallel_3dfft_execute(pme->pfft_setup[gridIndex], dir, thread, wcycle);
 162         }
 163         wallcycle_stop(wcycle, ewcPME_FFT_MIXED_MODE);
 164     }
 165 }
 166
 167 /* The PME computation code split into a few separate functions. */
 168
 169 void pme_gpu_prepare_computation(gmx_pme_t            *pme,
 170                                  bool                  needToUpdateBox,
 171                                  const matrix          box,
 172                                  gmx_wallcycle_t       wcycle,
 173                                  int                   flags)
 174 {
 175     GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
 176     GMX_ASSERT(pme->nnodes > 0, "");
 177     GMX_ASSERT(pme->nnodes == 1 || pme->ndecompdim > 0, "");
 178
 179     PmeGpu *pmeGpu = pme->gpu;
 180     pmeGpu->settings.currentFlags = flags;
 181     // TODO these flags are only here to honor the CPU PME code, and probably should be removed
 182
 183     bool shouldUpdateBox = false;
 184     for (int i = 0; i < DIM; ++i)
 185     {
 186         for (int j = 0; j <= i; ++j)
 187         {
 188             shouldUpdateBox                  |= (pmeGpu->common->previousBox[i][j] != box[i][j]);
 189             pmeGpu->common->previousBox[i][j] = box[i][j];
 190         }
 191     }
 192
 193     if (needToUpdateBox || shouldUpdateBox) // || is to make the first computation always update
 194     {
 195         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
 196         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
 197         pme_gpu_update_input_box(pmeGpu, box);
 198         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
 199         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
 200
 201         if (!pme_gpu_performs_solve(pmeGpu))
 202         {
 203             // TODO this has already been computed in pme->gpu
 204             //memcpy(pme->recipbox, pme->gpu->common->
 205             gmx::invertBoxMatrix(box, pme->recipbox);
 206             // FIXME verify that the box is scaled correctly on GPU codepath
 207             pme->boxVolume = box[XX][XX] * box[YY][YY] * box[ZZ][ZZ];
 208         }
 209     }
 210 }
 211
 212
 213 void pme_gpu_launch_spread(gmx_pme_t            *pme,
 214                            const rvec           *x,
 215                            gmx_wallcycle_t       wcycle)
 216 {
 217     GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
 218
 219     PmeGpu *pmeGpu = pme->gpu;
 220
 221     // The only spot of PME GPU where LAUNCH_GPU (sub)counter increases call-count
 222     wallcycle_start(wcycle, ewcLAUNCH_GPU);
 223     wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_PME);
 224     pme_gpu_copy_input_coordinates(pmeGpu, x);
 225     wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
 226     wallcycle_stop(wcycle, ewcLAUNCH_GPU);
 227
 228     const unsigned int gridIndex  = 0;
 229     real              *fftgrid    = pme->fftgrid[gridIndex];
 230     if (pmeGpu->settings.currentFlags & GMX_PME_SPREAD)
 231     {
 232         /* Spread the coefficients on a grid */
 233         const bool computeSplines = true;
 234         const bool spreadCharges  = true;
 235         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
 236         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
 237         pme_gpu_spread(pmeGpu, gridIndex, fftgrid, computeSplines, spreadCharges);
 238         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
 239         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
 240     }
 241 }
 242
 243 void pme_gpu_launch_complex_transforms(gmx_pme_t      *pme,
 244                                        gmx_wallcycle_t wcycle)
 245 {
 246     PmeGpu            *pmeGpu                 = pme->gpu;
 247     const bool         computeEnergyAndVirial = pmeGpu->settings.currentFlags & GMX_PME_CALC_ENER_VIR;
 248     const bool         performBackFFT         = pmeGpu->settings.currentFlags & (GMX_PME_CALC_F | GMX_PME_CALC_POT);
 249     const unsigned int gridIndex              = 0;
 250     t_complex         *cfftgrid               = pme->cfftgrid[gridIndex];
 251
 252     if (pmeGpu->settings.currentFlags & GMX_PME_SPREAD)
 253     {
 254         if (!pme_gpu_performs_FFT(pmeGpu))
 255         {
 256             wallcycle_start(wcycle, ewcWAIT_GPU_PME_SPREAD);
 257             pme_gpu_sync_spread_grid(pme->gpu);
 258             wallcycle_stop(wcycle, ewcWAIT_GPU_PME_SPREAD);
 259         }
 260     }
 261
 262     try
 263     {
 264         if (pmeGpu->settings.currentFlags & GMX_PME_SOLVE)
 265         {
 266             /* do R2C 3D-FFT */
 267             parallel_3dfft_execute_gpu_wrapper(pme, gridIndex, GMX_FFT_REAL_TO_COMPLEX, wcycle);
 268
 269             /* solve in k-space for our local cells */
 270             if (pme_gpu_performs_solve(pmeGpu))
 271             {
 272                 const auto gridOrdering = pme_gpu_uses_dd(pmeGpu) ? GridOrdering::YZX : GridOrdering::XYZ;
 273                 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
 274                 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME); //FIXME nocount
 275                 pme_gpu_solve(pmeGpu, cfftgrid, gridOrdering, computeEnergyAndVirial);
 276                 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
 277                 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
 278             }
 279             else
 280             {
 281                 wallcycle_start(wcycle, ewcPME_SOLVE_MIXED_MODE);
 282 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
 283                 for (int thread = 0; thread < pme->nthread; thread++)
 284                 {
 285                     solve_pme_yzx(pme, cfftgrid, pme->boxVolume,
 286                                   computeEnergyAndVirial, pme->nthread, thread);
 287                 }
 288                 wallcycle_stop(wcycle, ewcPME_SOLVE_MIXED_MODE);
 289             }
 290         }
 291
 292         if (performBackFFT)
 293         {
 294             parallel_3dfft_execute_gpu_wrapper(pme, gridIndex, GMX_FFT_COMPLEX_TO_REAL, wcycle);
 295         }
 296     } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
 297 }
 298
 299 void pme_gpu_launch_gather(const gmx_pme_t                 *pme,
 300                            gmx_wallcycle_t gmx_unused       wcycle,
 301                            PmeForceOutputHandling           forceTreatment)
 302 {
 303     GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
 304
 305     if (!pme_gpu_performs_gather(pme->gpu))
 306     {
 307         return;
 308     }
 309
 310     wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
 311     wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
 312     const unsigned int gridIndex  = 0;
 313     real              *fftgrid    = pme->fftgrid[gridIndex];
 314     pme_gpu_gather(pme->gpu, forceTreatment, reinterpret_cast<float *>(fftgrid));
 315     wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
 316     wallcycle_stop(wcycle, ewcLAUNCH_GPU);
 317 }
 318
 319 /*! \brief Reduce staged virial and energy outputs.
 320  *
 321  * \param[in]  pme            The PME data structure.
 322  * \param[out] forces         Output forces pointer, the internal ArrayRef pointers gets assigned to it.
 323  * \param[out] virial         The output virial matrix.
 324  * \param[out] energy         The output energy.
 325  */
 326 static void pme_gpu_get_staged_results(const gmx_pme_t                *pme,
 327                                        gmx::ArrayRef<const gmx::RVec> *forces,
 328                                        matrix                          virial,
 329                                        real                           *energy)
 330 {
 331     const bool haveComputedEnergyAndVirial = pme->gpu->settings.currentFlags & GMX_PME_CALC_ENER_VIR;
 332     *forces = pme_gpu_get_forces(pme->gpu);
 333
 334     if (haveComputedEnergyAndVirial)
 335     {
 336         if (pme_gpu_performs_solve(pme->gpu))
 337         {
 338             pme_gpu_get_energy_virial(pme->gpu, energy, virial);
 339         }
 340         else
 341         {
 342             get_pme_ener_vir_q(pme->solve_work, pme->nthread, energy, virial);
 343         }
 344     }
 345 }
 346
 347 bool pme_gpu_try_finish_task(const gmx_pme_t                *pme,
 348                              gmx_wallcycle_t                 wcycle,
 349                              gmx::ArrayRef<const gmx::RVec> *forces,
 350                              matrix                          virial,
 351                              real                           *energy,
 352                              GpuTaskCompletion               completionKind)
 353 {
 354     GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
 355
 356     wallcycle_start_nocount(wcycle, ewcWAIT_GPU_PME_GATHER);
 357
 358     if (completionKind == GpuTaskCompletion::Check)
 359     {
 360         // Query the PME stream for completion of all tasks enqueued and
 361         // if we're not done, stop the timer before early return.
 362         if (!pme_gpu_stream_query(pme->gpu))
 363         {
 364             wallcycle_stop(wcycle, ewcWAIT_GPU_PME_GATHER);
 365             return false;
 366         }
 367     }
 368     else
 369     {
 370         // Synchronize the whole PME stream at once, including D2H result transfers.
 371         pme_gpu_synchronize(pme->gpu);
 372     }
 373     wallcycle_stop(wcycle, ewcWAIT_GPU_PME_GATHER);
 374
 375     // Time the final staged data handling separately with a counting call to get
 376     // the call count right.
 377     wallcycle_start(wcycle, ewcWAIT_GPU_PME_GATHER);
 378
 379     // The computation has completed, do timing accounting and resetting buffers
 380     pme_gpu_update_timings(pme->gpu);
 381     // TODO: move this later and launch it together with the other
 382     // non-bonded tasks at the end of the step
 383     pme_gpu_reinit_computation(pme->gpu);
 384
 385     pme_gpu_get_staged_results(pme, forces, virial, energy);
 386
 387     wallcycle_stop(wcycle, ewcWAIT_GPU_PME_GATHER);
 388
 389     return true;
 390 }
 391
 392 void pme_gpu_wait_finish_task(const gmx_pme_t                *pme,
 393                               gmx_wallcycle_t                 wcycle,
 394                               gmx::ArrayRef<const gmx::RVec> *forces,
 395                               matrix                          virial,
 396                               real                           *energy)
 397 {
 398     pme_gpu_try_finish_task(pme, wcycle, forces, virial, energy, GpuTaskCompletion::Wait);
 399 }