/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2016,2017,2018,2019,2020 by the GROMACS development team.
 * Copyright (c) 2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 * \brief
 * Implements common routines for PME tests.
 *
 * \author Aleksei Iupinov <a.yupinov@gmail.com>
 * \ingroup module_ewald
 */
#include "gmxpre.h"

#include "pmetestcommon.h"

#include <cstring>

#include <algorithm>

#include "gromacs/domdec/domdec.h"
#include "gromacs/ewald/pme_coordinate_receiver_gpu.h"
#include "gromacs/ewald/pme_gather.h"
#include "gromacs/ewald/pme_gpu_calculate_splines.h"
#include "gromacs/ewald/pme_gpu_constants.h"
#include "gromacs/ewald/pme_gpu_internal.h"
#include "gromacs/ewald/pme_gpu_staging.h"
#include "gromacs/ewald/pme_grid.h"
#include "gromacs/ewald/pme_internal.h"
#include "gromacs/ewald/pme_redistribute.h"
#include "gromacs/ewald/pme_solve.h"
#include "gromacs/ewald/pme_spread.h"
#include "gromacs/fft/parallel_3dfft.h"
#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/hardware/device_management.h"
#include "gromacs/math/invertmatrix.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/topology/topology.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/logger.h"
#include "gromacs/utility/stringutil.h"

#include "testutils/test_hardware_environment.h"
#include "testutils/testasserts.h"

class DeviceContext;

namespace gmx
{
namespace test
{

//! A couple of valid inputs for boxes.
const std::map<std::string, Matrix3x3> c_inputBoxes = {
    { "rect", { { 8.0F, 0.0F, 0.0F, 0.0F, 3.4F, 0.0F, 0.0F, 0.0F, 2.0F } } },
    { "tric", { { 7.0F, 0.0F, 0.0F, 0.0F, 4.1F, 0.0F, 3.5F, 2.0F, 12.2F } } },
};
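
/* Note on layout (derived from the code below): Matrix3x3 is a flat,
 * row-major array, so each consecutive triple above is one row of the box
 * matrix. E.g. the "tric" box has rows (7.0, 0.0, 0.0), (0.0, 4.1, 0.0) and
 * (3.5, 2.0, 12.2); pmeInitWrapper() copies it element-wise into a legacy
 * matrix via boxTemp[i][j] = box[i * DIM + j]. */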

//! Valid PME orders for testing
std::vector<int> c_inputPmeOrders{ 3, 4, 5 };

bool pmeSupportsInputForMode(const gmx_hw_info_t& hwinfo, const t_inputrec* inputRec, CodePath mode)
{
    bool implemented;
    switch (mode)
    {
        case CodePath::CPU: implemented = true; break;

        case CodePath::GPU:
            implemented = (pme_gpu_supports_build(nullptr) && pme_gpu_supports_hardware(hwinfo, nullptr)
                           && pme_gpu_supports_input(*inputRec, nullptr));
            break;

        default: GMX_THROW(InternalError("Test not implemented for this mode"));
    }
    return implemented;
}

uint64_t getSplineModuliDoublePrecisionUlps(int splineOrder)
{
    /* Arbitrary ulp tolerance for sine/cosine implementation. It's
     * hard to know what to pick without testing lots of
     * implementations. */
    const uint64_t sineUlps = 10;
    return 4 * (splineOrder - 2) + 2 * sineUlps * splineOrder;
}
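
/* Worked example (illustrative): for spline order 4 the tolerance evaluates
 * to 4 * (4 - 2) + 2 * 10 * 4 = 88 ulps, and for order 5 it evaluates to
 * 4 * (5 - 2) + 2 * 10 * 5 = 112 ulps. */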

//! PME initialization
PmeSafePointer pmeInitWrapper(const t_inputrec*    inputRec,
                              const CodePath       mode,
                              const DeviceContext* deviceContext,
                              const DeviceStream*  deviceStream,
                              const PmeGpuProgram* pmeGpuProgram,
                              const Matrix3x3&     box,
                              const real           ewaldCoeff_q,
                              const real           ewaldCoeff_lj)
{
    const MDLogger dummyLogger;
    const auto     runMode       = (mode == CodePath::CPU) ? PmeRunMode::CPU : PmeRunMode::Mixed;
    t_commrec      dummyCommrec  = { 0 };
    NumPmeDomains  numPmeDomains = { 1, 1 };
    gmx_pme_t*     pmeDataRaw    = gmx_pme_init(&dummyCommrec,
                                         numPmeDomains,
                                         inputRec,
                                         false,
                                         false,
                                         true,
                                         ewaldCoeff_q,
                                         ewaldCoeff_lj,
                                         1,
                                         runMode,
                                         nullptr,
                                         deviceContext,
                                         deviceStream,
                                         pmeGpuProgram,
                                         dummyLogger);
    PmeSafePointer pme(pmeDataRaw); // taking ownership

    // TODO get rid of this with proper matrix type
    matrix boxTemp;
    for (int i = 0; i < DIM; i++)
    {
        for (int j = 0; j < DIM; j++)
        {
            boxTemp[i][j] = box[i * DIM + j];
        }
    }
    const char* boxError = check_box(PbcType::Unset, boxTemp);
    GMX_RELEASE_ASSERT(boxError == nullptr, boxError);

    switch (mode)
    {
        case CodePath::CPU: invertBoxMatrix(boxTemp, pme->recipbox); break;

        case CodePath::GPU:
            pme_gpu_set_testing(pme->gpu, true);
            pme_gpu_update_input_box(pme->gpu, boxTemp);
            break;

        default: GMX_THROW(InternalError("Test not implemented for this mode"));
    }

    return pme;
}
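
/* Usage sketch (a minimal, hypothetical CPU-path example; the grid size,
 * PME order and Ewald coefficients below are illustrative values, not taken
 * from any particular test):
 *
 *   t_inputrec inputRec;
 *   inputRec.nkx = inputRec.nky = inputRec.nkz = 16;
 *   inputRec.pme_order       = 4;
 *   const PmeSafePointer pme = pmeInitWrapper(&inputRec, CodePath::CPU,
 *                                             nullptr, nullptr, nullptr,
 *                                             c_inputBoxes.at("rect"), 1.0F, 1.0F);
 */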

PmeSafePointer pmeInitEmpty(const t_inputrec* inputRec)
{
    const Matrix3x3 defaultBox = { { 1.0F, 0.0F, 0.0F, 0.0F, 1.0F, 0.0F, 0.0F, 0.0F, 1.0F } };
    return pmeInitWrapper(inputRec, CodePath::CPU, nullptr, nullptr, nullptr, defaultBox, 0.0F, 0.0F);
}

//! Make a GPU state-propagator manager
std::unique_ptr<StatePropagatorDataGpu> makeStatePropagatorDataGpu(const gmx_pme_t&     pme,
                                                                   const DeviceContext* deviceContext,
                                                                   const DeviceStream* deviceStream)
{
    // TODO: Pin the host buffer and use async memory copies
    // TODO: A special constructor for PME-only ranks / PME tests is used here. There should be
    //       a mechanism to prevent the other constructors from being used in this context.
    return std::make_unique<StatePropagatorDataGpu>(
            deviceStream, *deviceContext, GpuApiCallBehavior::Sync, pme_gpu_get_block_size(&pme), nullptr);
}

//! PME initialization with atom data
void pmeInitAtoms(gmx_pme_t*               pme,
                  StatePropagatorDataGpu*  stateGpu,
                  const CodePath           mode,
                  const CoordinatesVector& coordinates,
                  const ChargesVector&     charges)
{
    const index atomCount = coordinates.size();
    GMX_RELEASE_ASSERT(atomCount == gmx::ssize(charges), "Mismatch in atom data");
    PmeAtomComm* atc = nullptr;

    switch (mode)
    {
        case CodePath::CPU:
            atc              = &(pme->atc[0]);
            atc->x           = coordinates;
            atc->coefficient = charges;
            gmx_pme_reinit_atoms(pme, atomCount, charges, {});
            /* With decomposition there would be more boilerplate atc code here, e.g. do_redist_pos_coeffs */
            break;

        case CodePath::GPU:
            // TODO: Avoid use of atc in the GPU code path
            atc = &(pme->atc[0]);
            // We need to set atc->n for passing the size in the tests
            atc->setNumAtoms(atomCount);
            gmx_pme_reinit_atoms(pme, atomCount, charges, {});

            stateGpu->reinit(atomCount, atomCount);
            stateGpu->copyCoordinatesToGpu(arrayRefFromArray(coordinates.data(), coordinates.size()),
                                           gmx::AtomLocality::Local);
            pme_gpu_set_kernelparam_coordinates(pme->gpu, stateGpu->getCoordinates());

            break;

        default: GMX_THROW(InternalError("Test not implemented for this mode"));
    }
}

//! Getting local PME real grid pointer for test I/O
static real* pmeGetRealGridInternal(const gmx_pme_t* pme)
{
    const size_t gridIndex = 0;
    return pme->fftgrid[gridIndex];
}

//! Getting local PME real grid dimensions
static void pmeGetRealGridSizesInternal(const gmx_pme_t* pme,
                                        CodePath         mode,
                                        IVec& gridSize,       //NOLINT(google-runtime-references)
                                        IVec& paddedGridSize) //NOLINT(google-runtime-references)
{
    const size_t gridIndex = 0;
    IVec         gridOffsetUnused;
    switch (mode)
    {
        case CodePath::CPU:
            gmx_parallel_3dfft_real_limits(
                    pme->pfft_setup[gridIndex], gridSize, gridOffsetUnused, paddedGridSize);
            break;

        case CodePath::GPU:
            pme_gpu_get_real_grid_sizes(pme->gpu, &gridSize, &paddedGridSize);
            break;

        default: GMX_THROW(InternalError("Test not implemented for this mode"));
    }
}

//! Getting local PME complex grid pointer for test I/O
static t_complex* pmeGetComplexGridInternal(const gmx_pme_t* pme)
{
    const size_t gridIndex = 0;
    return pme->cfftgrid[gridIndex];
}

//! Getting local PME complex grid dimensions
static void pmeGetComplexGridSizesInternal(const gmx_pme_t* pme,
                                           IVec& gridSize,       //NOLINT(google-runtime-references)
                                           IVec& paddedGridSize) //NOLINT(google-runtime-references)
{
    const size_t gridIndex = 0;
    IVec         gridOffsetUnused, complexOrderUnused;
    gmx_parallel_3dfft_complex_limits(
            pme->pfft_setup[gridIndex], complexOrderUnused, gridSize, gridOffsetUnused, paddedGridSize); // TODO: what about YZX ordering?
}

//! Getting the PME grid memory buffer and its sizes - template definition
template<typename ValueType>
static void pmeGetGridAndSizesInternal(const gmx_pme_t* /*unused*/,
                                       CodePath /*unused*/,
                                       ValueType*& /*unused*/, //NOLINT(google-runtime-references)
                                       IVec& /*unused*/,       //NOLINT(google-runtime-references)
                                       IVec& /*unused*/)       //NOLINT(google-runtime-references)
{
    GMX_THROW(InternalError("Deleted function call"));
    // explicitly deleting general template does not compile in clang, see https://llvm.org/bugs/show_bug.cgi?id=17537
}

//! Getting the PME real grid memory buffer and its sizes
template<>
void pmeGetGridAndSizesInternal<real>(const gmx_pme_t* pme, CodePath mode, real*& grid, IVec& gridSize, IVec& paddedGridSize)
{
    grid = pmeGetRealGridInternal(pme);
    pmeGetRealGridSizesInternal(pme, mode, gridSize, paddedGridSize);
}

//! Getting the PME complex grid memory buffer and its sizes
template<>
void pmeGetGridAndSizesInternal<t_complex>(const gmx_pme_t* pme,
                                           CodePath /*unused*/,
                                           t_complex*& grid,
                                           IVec&       gridSize,
                                           IVec&       paddedGridSize)
{
    grid = pmeGetComplexGridInternal(pme);
    pmeGetComplexGridSizesInternal(pme, gridSize, paddedGridSize);
}

//! PME spline calculation and charge spreading
void pmePerformSplineAndSpread(gmx_pme_t* pme,
                               CodePath   mode, // TODO const qualifiers elsewhere
                               bool       computeSplines,
                               bool       spreadCharges)
{
    GMX_RELEASE_ASSERT(pme != nullptr, "PME data is not initialized");
    PmeAtomComm* atc                          = &(pme->atc[0]);
    const size_t gridIndex                    = 0;
    const bool   computeSplinesForZeroCharges = true;
    real**       fftgrid                      = spreadCharges ? pme->fftgrid : nullptr;
    real*        pmegrid                      = pme->pmegrid[gridIndex].grid.grid;

    switch (mode)
    {
        case CodePath::CPU:
            spread_on_grid(pme,
                           atc,
                           &pme->pmegrid[gridIndex],
                           computeSplines,
                           spreadCharges,
                           fftgrid != nullptr ? fftgrid[gridIndex] : nullptr,
                           computeSplinesForZeroCharges,
                           gridIndex);
            if (spreadCharges && !pme->bUseThreads)
            {
                wrap_periodic_pmegrid(pme, pmegrid);
                copy_pmegrid_to_fftgrid(
                        pme, pmegrid, fftgrid != nullptr ? fftgrid[gridIndex] : nullptr, gridIndex);
            }
            break;

/* The compiler will complain about passing fftgrid (converting double ** to float **) if using
 * double precision. GPUs are not used with double precision anyhow. */
#if !GMX_DOUBLE
        case CodePath::GPU:
        {
            const real lambdaQ = 1.0;
            // no synchronization needed as x is transferred in the PME stream
            GpuEventSynchronizer* xReadyOnDevice = nullptr;

            bool                           useGpuDirectComm         = false;
            gmx::PmeCoordinateReceiverGpu* pmeCoordinateReceiverGpu = nullptr;

            pme_gpu_spread(pme->gpu,
                           xReadyOnDevice,
                           fftgrid,
                           computeSplines,
                           spreadCharges,
                           lambdaQ,
                           useGpuDirectComm,
                           pmeCoordinateReceiverGpu);
        }
        break;
#endif

        default: GMX_THROW(InternalError("Test not implemented for this mode"));
    }
}

//! Getting the internal spline data buffer pointer
static real* pmeGetSplineDataInternal(const gmx_pme_t* pme, PmeSplineDataType type, int dimIndex)
{
    GMX_ASSERT((0 <= dimIndex) && (dimIndex < DIM), "Invalid dimension index");
    const PmeAtomComm* atc          = &(pme->atc[0]);
    const size_t       threadIndex  = 0;
    real*              splineBuffer = nullptr;
    switch (type)
    {
        case PmeSplineDataType::Values:
            splineBuffer = atc->spline[threadIndex].theta.coefficients[dimIndex];
            break;

        case PmeSplineDataType::Derivatives:
            splineBuffer = atc->spline[threadIndex].dtheta.coefficients[dimIndex];
            break;

        default: GMX_THROW(InternalError("Unknown spline data type"));
    }
    return splineBuffer;
}

//! PME solving
void pmePerformSolve(const gmx_pme_t*  pme,
                     CodePath          mode,
                     PmeSolveAlgorithm method,
                     real              cellVolume,
                     GridOrdering      gridOrdering,
                     bool              computeEnergyAndVirial)
{
    t_complex*   h_grid              = pmeGetComplexGridInternal(pme);
    const bool   useLorentzBerthelot = false;
    const size_t threadIndex         = 0;
    const size_t gridIndex           = 0;
    switch (mode)
    {
        case CodePath::CPU:
            if (gridOrdering != GridOrdering::YZX)
            {
                GMX_THROW(InternalError("Test not implemented for this mode"));
            }
            switch (method)
            {
                case PmeSolveAlgorithm::Coulomb:
                    solve_pme_yzx(pme, h_grid, cellVolume, computeEnergyAndVirial, pme->nthread, threadIndex);
                    break;

                case PmeSolveAlgorithm::LennardJones:
                    solve_pme_lj_yzx(pme,
                                     &h_grid,
                                     useLorentzBerthelot,
                                     cellVolume,
                                     computeEnergyAndVirial,
                                     pme->nthread,
                                     threadIndex);
                    break;

                default: GMX_THROW(InternalError("Test not implemented for this mode"));
            }
            break;

        case CodePath::GPU:
            switch (method)
            {
                case PmeSolveAlgorithm::Coulomb:
                    pme_gpu_solve(pme->gpu, gridIndex, h_grid, gridOrdering, computeEnergyAndVirial);
                    break;

                default: GMX_THROW(InternalError("Test not implemented for this mode"));
            }
            break;

        default: GMX_THROW(InternalError("Test not implemented for this mode"));
    }
}

//! PME force gathering
void pmePerformGather(gmx_pme_t* pme, CodePath mode, ForcesVector& forces)
{
    PmeAtomComm* atc       = &(pme->atc[0]);
    const index  atomCount = atc->numAtoms();
    GMX_RELEASE_ASSERT(forces.ssize() == atomCount, "Invalid force buffer size");
    const real   scale       = 1.0;
    const size_t threadIndex = 0;
    const size_t gridIndex   = 0;
    real*        pmegrid     = pme->pmegrid[gridIndex].grid.grid;
    real**       fftgrid     = pme->fftgrid;

    switch (mode)
    {
        case CodePath::CPU:
            atc->f = forces;
            if (atc->nthread == 1)
            {
                // Something that is normally done in the serial spline computation (make_thread_local_ind())
                atc->spline[threadIndex].n = atomCount;
            }
            copy_fftgrid_to_pmegrid(pme, fftgrid[gridIndex], pmegrid, gridIndex, pme->nthread, threadIndex);
            unwrap_periodic_pmegrid(pme, pmegrid);
            gather_f_bsplines(pme, pmegrid, true, atc, &atc->spline[threadIndex], scale);
            break;

/* The compiler will complain about passing fftgrid (converting double ** to float **) if using
 * double precision. GPUs are not used with double precision anyhow. */
#if !GMX_DOUBLE
        case CodePath::GPU:
        {
            // Variable initialization needs a non-switch scope
            const bool computeEnergyAndVirial = false;
            const real lambdaQ                = 1.0;
            PmeOutput  output = pme_gpu_getOutput(*pme, computeEnergyAndVirial, lambdaQ);
            GMX_ASSERT(forces.size() == output.forces_.size(),
                       "Size of force buffers did not match");
            pme_gpu_gather(pme->gpu, fftgrid, lambdaQ);
            std::copy(std::begin(output.forces_), std::end(output.forces_), std::begin(forces));
        }
        break;
#endif

        default: GMX_THROW(InternalError("Test not implemented for this mode"));
    }
}

//! PME test finalization before fetching the outputs
void pmeFinalizeTest(const gmx_pme_t* pme, CodePath mode)
{
    switch (mode)
    {
        case CodePath::CPU: break;

        case CodePath::GPU: pme_gpu_synchronize(pme->gpu); break;

        default: GMX_THROW(InternalError("Test not implemented for this mode"));
    }
}
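
/* Typical stage-test sequence (sketch; a real test also sets up splines,
 * gridline indices or grids as needed for the stage under test):
 *
 *   pmeInitAtoms(pme.get(), stateGpu.get(), mode, coordinates, charges);
 *   pmePerformSplineAndSpread(pme.get(), mode, true, true);
 *   pmeFinalizeTest(pme.get(), mode);
 *   SparseRealGridValuesOutput nonZeroGridValues = pmeGetRealGrid(pme.get(), mode);
 *
 * pmeFinalizeTest() must precede fetching the outputs so that, on the GPU
 * path, all asynchronous work has been synchronized. */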

//! A binary enum for spline data layout transformation
enum class PmeLayoutTransform
{
    GpuToHost,
    HostToGpu
};

/*! \brief Gets a unique index to an element in a spline parameter buffer.
 *
 * These theta/dtheta buffers are laid out for GPU spread/gather
 * kernels. The index is wrt the execution block, in range(0,
 * atomsPerBlock * order * DIM).
 *
 * This is a wrapper, only used in unit tests.
 * \param[in] order            PME order
 * \param[in] splineIndex      Spline contribution index (from 0 to \p order - 1)
 * \param[in] dimIndex         Dimension index (from 0 to 2)
 * \param[in] atomIndex        Atom index wrt the block.
 * \param[in] atomsPerWarp     Number of atoms processed by a warp.
 *
 * \returns Index into theta or dtheta array using GPU layout.
 */
static int getSplineParamFullIndex(int order, int splineIndex, int dimIndex, int atomIndex, int atomsPerWarp)
{
    if (order != c_pmeGpuOrder)
    {
        throw order;
    }
    constexpr int fixedOrder = c_pmeGpuOrder;
    GMX_UNUSED_VALUE(fixedOrder);

    const int atomWarpIndex = atomIndex % atomsPerWarp;
    const int warpIndex     = atomIndex / atomsPerWarp;
    int       indexBase, result;
    switch (atomsPerWarp)
    {
        case 1:
            indexBase = getSplineParamIndexBase<fixedOrder, 1>(warpIndex, atomWarpIndex);
            result    = getSplineParamIndex<fixedOrder, 1>(indexBase, dimIndex, splineIndex);
            break;

        case 2:
            indexBase = getSplineParamIndexBase<fixedOrder, 2>(warpIndex, atomWarpIndex);
            result    = getSplineParamIndex<fixedOrder, 2>(indexBase, dimIndex, splineIndex);
            break;

        case 4:
            indexBase = getSplineParamIndexBase<fixedOrder, 4>(warpIndex, atomWarpIndex);
            result    = getSplineParamIndex<fixedOrder, 4>(indexBase, dimIndex, splineIndex);
            break;

        case 8:
            indexBase = getSplineParamIndexBase<fixedOrder, 8>(warpIndex, atomWarpIndex);
            result    = getSplineParamIndex<fixedOrder, 8>(indexBase, dimIndex, splineIndex);
            break;

        default:
            GMX_THROW(NotImplementedError(
                    formatString("Test function call not unrolled for atomsPerWarp = %d in "
                                 "getSplineParamFullIndex",
                                 atomsPerWarp)));
    }
    return result;
}
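
/* Illustrative example: with order = 4 and atomsPerWarp = 2, atom 5 of a
 * block maps to warpIndex = 5 / 2 = 2 and atomWarpIndex = 5 % 2 = 1; the
 * returned value then indexes that atom's (dimIndex, splineIndex) entry
 * within warp 2's chunk of the theta/dtheta buffer. */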

/*! \brief Return the number of atoms per warp */
static int pme_gpu_get_atoms_per_warp(const PmeGpu* pmeGpu)
{
    const int order = pmeGpu->common->pme_order;
    const int threadsPerAtom =
            (pmeGpu->settings.threadsPerAtom == ThreadsPerAtom::Order ? order : order * order);
    return pmeGpu->programHandle_->warpSize() / threadsPerAtom;
}
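
/* Example (assuming a warp size of 32 and PME order 4): with
 * ThreadsPerAtom::Order, 4 threads work on each atom, giving 32 / 4 = 8
 * atoms per warp; with the order * order layout, 16 threads work on each
 * atom, giving 32 / 16 = 2 atoms per warp. */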

/*! \brief Rearranges the atom spline data between the GPU and host layouts.
 * Only used for test purposes so far, likely to be horribly slow.
 *
 * \param[in]  pmeGpu     The PME GPU structure.
 * \param[out] atc        The PME CPU atom data structure (with a single-threaded layout).
 * \param[in]  type       The spline data type (values or derivatives).
 * \param[in]  dimIndex   Dimension index.
 * \param[in]  transform  Layout transform type
 */
static void pme_gpu_transform_spline_atom_data(const PmeGpu*      pmeGpu,
                                               const PmeAtomComm* atc,
                                               PmeSplineDataType  type,
                                               int                dimIndex,
                                               PmeLayoutTransform transform)
{
    // The GPU atom spline data is laid out in a different way currently than the CPU one.
    // This function converts the data from GPU to CPU layout (in the host memory).
    // It is only intended for testing purposes so far.
    // Ideally we should use similar layouts on CPU and GPU if we care about mixed modes and their
    // performance (e.g. spreading on GPU, gathering on CPU).
    GMX_RELEASE_ASSERT(atc->nthread == 1, "Only the serial PME data layout is supported");
    const uintmax_t threadIndex  = 0;
    const auto      atomCount    = atc->numAtoms();
    const auto      atomsPerWarp = pme_gpu_get_atoms_per_warp(pmeGpu);
    const auto      pmeOrder     = pmeGpu->common->pme_order;
    GMX_ASSERT(pmeOrder == c_pmeGpuOrder, "Only PME order 4 is implemented");

    real*  cpuSplineBuffer;
    float* h_splineBuffer;
    switch (type)
    {
        case PmeSplineDataType::Values:
            cpuSplineBuffer = atc->spline[threadIndex].theta.coefficients[dimIndex];
            h_splineBuffer  = pmeGpu->staging.h_theta;
            break;

        case PmeSplineDataType::Derivatives:
            cpuSplineBuffer = atc->spline[threadIndex].dtheta.coefficients[dimIndex];
            h_splineBuffer  = pmeGpu->staging.h_dtheta;
            break;

        default: GMX_THROW(InternalError("Unknown spline data type"));
    }

    for (auto atomIndex = 0; atomIndex < atomCount; atomIndex++)
    {
        for (auto orderIndex = 0; orderIndex < pmeOrder; orderIndex++)
        {
            const auto gpuValueIndex =
                    getSplineParamFullIndex(pmeOrder, orderIndex, dimIndex, atomIndex, atomsPerWarp);
            const auto cpuValueIndex = atomIndex * pmeOrder + orderIndex;
            GMX_ASSERT(cpuValueIndex < atomCount * pmeOrder,
                       "Atom spline data index out of bounds (while transforming GPU data layout "
                       "for host)");
            switch (transform)
            {
                case PmeLayoutTransform::GpuToHost:
                    cpuSplineBuffer[cpuValueIndex] = h_splineBuffer[gpuValueIndex];
                    break;

                case PmeLayoutTransform::HostToGpu:
                    h_splineBuffer[gpuValueIndex] = cpuSplineBuffer[cpuValueIndex];
                    break;

                default: GMX_THROW(InternalError("Unknown layout transform"));
            }
        }
    }
}

//! Setting atom spline values/derivatives to be used in spread/gather
void pmeSetSplineData(const gmx_pme_t*             pme,
                      CodePath                     mode,
                      const SplineParamsDimVector& splineValues,
                      PmeSplineDataType            type,
                      int                          dimIndex)
{
    const PmeAtomComm* atc       = &(pme->atc[0]);
    const index        atomCount = atc->numAtoms();
    const index        pmeOrder  = pme->pme_order;
    const index        dimSize   = pmeOrder * atomCount;
    GMX_RELEASE_ASSERT(dimSize == splineValues.ssize(), "Mismatch in spline data");
    real* splineBuffer = pmeGetSplineDataInternal(pme, type, dimIndex);

    switch (mode)
    {
        case CodePath::CPU:
            std::copy(splineValues.begin(), splineValues.end(), splineBuffer);
            break;

        case CodePath::GPU:
            std::copy(splineValues.begin(), splineValues.end(), splineBuffer);
            pme_gpu_transform_spline_atom_data(pme->gpu, atc, type, dimIndex, PmeLayoutTransform::HostToGpu);
            break;

        default: GMX_THROW(InternalError("Test not implemented for this mode"));
    }
}

//! Setting gridline indices to be used in spread/gather
void pmeSetGridLineIndices(gmx_pme_t* pme, CodePath mode, const GridLineIndicesVector& gridLineIndices)
{
    PmeAtomComm* atc       = &(pme->atc[0]);
    const index  atomCount = atc->numAtoms();
    GMX_RELEASE_ASSERT(atomCount == ssize(gridLineIndices), "Mismatch in gridline indices size");

    IVec paddedGridSizeUnused, gridSize(0, 0, 0);
    pmeGetRealGridSizesInternal(pme, mode, gridSize, paddedGridSizeUnused);

    for (const auto& index : gridLineIndices)
    {
        for (int i = 0; i < DIM; i++)
        {
            GMX_RELEASE_ASSERT((0 <= index[i]) && (index[i] < gridSize[i]),
                               "Invalid gridline index");
        }
    }

    switch (mode)
    {
        case CodePath::GPU:
            memcpy(pme_gpu_staging(pme->gpu).h_gridlineIndices,
                   gridLineIndices.data(),
                   atomCount * sizeof(gridLineIndices[0]));
            break;

        case CodePath::CPU:
            atc->idx.resize(gridLineIndices.size());
            std::copy(gridLineIndices.begin(), gridLineIndices.end(), atc->idx.begin());
            break;
        default: GMX_THROW(InternalError("Test not implemented for this mode"));
    }
}

//! Getting plain index into the complex 3d grid
inline size_t pmeGetGridPlainIndexInternal(const IVec& index, const IVec& paddedGridSize, GridOrdering gridOrdering)
{
    size_t result;
    switch (gridOrdering)
    {
        case GridOrdering::YZX:
            result = (index[YY] * paddedGridSize[ZZ] + index[ZZ]) * paddedGridSize[XX] + index[XX];
            break;

        case GridOrdering::XYZ:
            result = (index[XX] * paddedGridSize[YY] + index[YY]) * paddedGridSize[ZZ] + index[ZZ];
            break;

        default: GMX_THROW(InternalError("Test not implemented for this mode"));
    }
    return result;
}
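
/* Worked example (illustrative): for index (1, 2, 3) in a padded grid of
 * size (10, 10, 10), XYZ ordering gives (1 * 10 + 2) * 10 + 3 = 123, while
 * YZX ordering gives (2 * 10 + 3) * 10 + 1 = 231. */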

//! Setting real or complex grid
template<typename ValueType>
static void pmeSetGridInternal(const gmx_pme_t*                        pme,
                               CodePath                                mode,
                               GridOrdering                            gridOrdering,
                               const SparseGridValuesInput<ValueType>& gridValues)
{
    IVec       gridSize(0, 0, 0), paddedGridSize(0, 0, 0);
    ValueType* grid;
    pmeGetGridAndSizesInternal<ValueType>(pme, mode, grid, gridSize, paddedGridSize);

    switch (mode)
    {
        case CodePath::GPU: // intentional absence of break, the grid will be copied from the host buffer in testing mode
        case CodePath::CPU:
            std::memset(grid, 0, paddedGridSize[XX] * paddedGridSize[YY] * paddedGridSize[ZZ] * sizeof(ValueType));
            for (const auto& gridValue : gridValues)
            {
                for (int i = 0; i < DIM; i++)
                {
                    GMX_RELEASE_ASSERT((0 <= gridValue.first[i]) && (gridValue.first[i] < gridSize[i]),
                                       "Invalid grid value index");
                }
                const size_t gridValueIndex =
                        pmeGetGridPlainIndexInternal(gridValue.first, paddedGridSize, gridOrdering);
                grid[gridValueIndex] = gridValue.second;
            }
            break;

        default: GMX_THROW(InternalError("Test not implemented for this mode"));
    }
}

//! Setting real grid to be used in gather
void pmeSetRealGrid(const gmx_pme_t* pme, CodePath mode, const SparseRealGridValuesInput& gridValues)
{
    pmeSetGridInternal<real>(pme, mode, GridOrdering::XYZ, gridValues);
}

//! Setting complex grid to be used in solve
void pmeSetComplexGrid(const gmx_pme_t*                    pme,
                       CodePath                            mode,
                       GridOrdering                        gridOrdering,
                       const SparseComplexGridValuesInput& gridValues)
{
    pmeSetGridInternal<t_complex>(pme, mode, gridOrdering, gridValues);
}

//! Getting the single dimension's spline values or derivatives
SplineParamsDimVector pmeGetSplineData(const gmx_pme_t* pme, CodePath mode, PmeSplineDataType type, int dimIndex)
{
    GMX_RELEASE_ASSERT(pme != nullptr, "PME data is not initialized");
    const PmeAtomComm* atc       = &(pme->atc[0]);
    const size_t       atomCount = atc->numAtoms();
    const size_t       pmeOrder  = pme->pme_order;
    const size_t       dimSize   = pmeOrder * atomCount;

    real*                 sourceBuffer = pmeGetSplineDataInternal(pme, type, dimIndex);
    SplineParamsDimVector result;
    switch (mode)
    {
        case CodePath::GPU:
            pme_gpu_transform_spline_atom_data(pme->gpu, atc, type, dimIndex, PmeLayoutTransform::GpuToHost);
            result = arrayRefFromArray(sourceBuffer, dimSize);
            break;

        case CodePath::CPU: result = arrayRefFromArray(sourceBuffer, dimSize); break;

        default: GMX_THROW(InternalError("Test not implemented for this mode"));
    }
    return result;
}

//! Getting the gridline indices
GridLineIndicesVector pmeGetGridlineIndices(const gmx_pme_t* pme, CodePath mode)
{
    GMX_RELEASE_ASSERT(pme != nullptr, "PME data is not initialized");
    const PmeAtomComm* atc       = &(pme->atc[0]);
    const size_t       atomCount = atc->numAtoms();

    GridLineIndicesVector gridLineIndices;
    switch (mode)
    {
        case CodePath::GPU:
        {
            auto* gridlineIndicesAsIVec =
                    reinterpret_cast<IVec*>(pme_gpu_staging(pme->gpu).h_gridlineIndices);
            ArrayRef<IVec> gridlineIndicesArrayRef = arrayRefFromArray(gridlineIndicesAsIVec, atomCount);
            gridLineIndices = { gridlineIndicesArrayRef.begin(), gridlineIndicesArrayRef.end() };
        }
        break;

        case CodePath::CPU: gridLineIndices = { atc->idx.begin(), atc->idx.end() }; break;

        default: GMX_THROW(InternalError("Test not implemented for this mode"));
    }
    return gridLineIndices;
}

//! Getting real or complex grid - only non-zero values
template<typename ValueType>
static SparseGridValuesOutput<ValueType> pmeGetGridInternal(const gmx_pme_t* pme,
                                                            CodePath         mode,
                                                            GridOrdering     gridOrdering)
{
    IVec       gridSize(0, 0, 0), paddedGridSize(0, 0, 0);
    ValueType* grid;
    pmeGetGridAndSizesInternal<ValueType>(pme, mode, grid, gridSize, paddedGridSize);
    SparseGridValuesOutput<ValueType> gridValues;
    switch (mode)
    {
        case CodePath::GPU: // intentional absence of break
        case CodePath::CPU:
            gridValues.clear();
            for (int ix = 0; ix < gridSize[XX]; ix++)
            {
                for (int iy = 0; iy < gridSize[YY]; iy++)
                {
                    for (int iz = 0; iz < gridSize[ZZ]; iz++)
                    {
                        IVec         temp(ix, iy, iz);
                        const size_t gridValueIndex =
                                pmeGetGridPlainIndexInternal(temp, paddedGridSize, gridOrdering);
                        const ValueType value = grid[gridValueIndex];
                        if (value != ValueType{})
                        {
                            auto key        = formatString("Cell %d %d %d", ix, iy, iz);
                            gridValues[key] = value;
                        }
                    }
                }
            }
            break;

        default: GMX_THROW(InternalError("Test not implemented for this mode"));
    }
    return gridValues;
}

//! Getting the real grid (spreading output of pmePerformSplineAndSpread())
SparseRealGridValuesOutput pmeGetRealGrid(const gmx_pme_t* pme, CodePath mode)
{
    return pmeGetGridInternal<real>(pme, mode, GridOrdering::XYZ);
}

//! Getting the complex grid output of pmePerformSolve()
SparseComplexGridValuesOutput pmeGetComplexGrid(const gmx_pme_t* pme, CodePath mode, GridOrdering gridOrdering)
{
    return pmeGetGridInternal<t_complex>(pme, mode, gridOrdering);
}

//! Getting the reciprocal energy and virial
PmeOutput pmeGetReciprocalEnergyAndVirial(const gmx_pme_t* pme, CodePath mode, PmeSolveAlgorithm method)
{
    PmeOutput  output;
    const real lambdaQ = 1.0;
    switch (mode)
    {
        case CodePath::CPU:
            switch (method)
            {
                case PmeSolveAlgorithm::Coulomb:
                    get_pme_ener_vir_q(pme->solve_work, pme->nthread, &output);
                    break;

                case PmeSolveAlgorithm::LennardJones:
                    get_pme_ener_vir_lj(pme->solve_work, pme->nthread, &output);
                    break;

                default: GMX_THROW(InternalError("Test not implemented for this mode"));
            }
            break;
        case CodePath::GPU:
            switch (method)
            {
                case PmeSolveAlgorithm::Coulomb:
                    pme_gpu_getEnergyAndVirial(*pme, lambdaQ, &output);
                    break;

                default: GMX_THROW(InternalError("Test not implemented for this mode"));
            }
            break;

        default: GMX_THROW(InternalError("Test not implemented for this mode"));
    }
    return output;
}

PmeTestHardwareContext::PmeTestHardwareContext() : codePath_(CodePath::CPU) {}

PmeTestHardwareContext::PmeTestHardwareContext(TestDevice* testDevice) :
    codePath_(CodePath::GPU), testDevice_(testDevice)
{
    setActiveDevice(testDevice_->deviceInfo());
    pmeGpuProgram_ = buildPmeGpuProgram(testDevice_->deviceContext());
}

//! Returns a human-readable context description line
std::string PmeTestHardwareContext::description() const
{
    switch (codePath_)
    {
        case CodePath::CPU: return "CPU";
        case CodePath::GPU: return "GPU (" + testDevice_->description() + ")";
        default: return "Unknown code path.";
    }
}

void PmeTestHardwareContext::activate() const
{
    if (codePath_ == CodePath::GPU)
    {
        setActiveDevice(testDevice_->deviceInfo());
    }
}

std::vector<std::unique_ptr<PmeTestHardwareContext>> createPmeTestHardwareContextList()
{
    std::vector<std::unique_ptr<PmeTestHardwareContext>> pmeTestHardwareContextList;
    // Add CPU
    pmeTestHardwareContextList.emplace_back(std::make_unique<PmeTestHardwareContext>());
    // Add GPU devices
    const auto& testDeviceList = getTestHardwareEnvironment()->getTestDeviceList();
    for (const auto& testDevice : testDeviceList)
    {
        pmeTestHardwareContextList.emplace_back(std::make_unique<PmeTestHardwareContext>(testDevice.get()));
    }
    return pmeTestHardwareContextList;
}

} // namespace test
} // namespace gmx