/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2016,2017,2018,2019,2020 by the GROMACS development team.
 * Copyright (c) 2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 * \brief
 * Implements common routines for PME tests.
 *
 * \author Aleksei Iupinov <a.yupinov@gmail.com>
 * \ingroup module_ewald
 */
#include "gmxpre.h"

#include "pmetestcommon.h"

#include <cstring>

#include <algorithm>

#include "gromacs/domdec/domdec.h"
#include "gromacs/ewald/pme_coordinate_receiver_gpu.h"
#include "gromacs/ewald/pme_gather.h"
#include "gromacs/ewald/pme_gpu_calculate_splines.h"
#include "gromacs/ewald/pme_gpu_constants.h"
#include "gromacs/ewald/pme_gpu_internal.h"
#include "gromacs/ewald/pme_gpu_staging.h"
#include "gromacs/ewald/pme_grid.h"
#include "gromacs/ewald/pme_internal.h"
#include "gromacs/ewald/pme_redistribute.h"
#include "gromacs/ewald/pme_solve.h"
#include "gromacs/ewald/pme_spread.h"
#include "gromacs/fft/parallel_3dfft.h"
#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/hardware/device_management.h"
#include "gromacs/math/invertmatrix.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/topology/topology.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/logger.h"
#include "gromacs/utility/stringutil.h"

#include "testutils/test_hardware_environment.h"
#include "testutils/testasserts.h"

class DeviceContext;

namespace gmx
{
namespace test
{

//! A couple of valid inputs for boxes.
const std::map<std::string, Matrix3x3> c_inputBoxes = {
    { "rect", { { 8.0F, 0.0F, 0.0F, 0.0F, 3.4F, 0.0F, 0.0F, 0.0F, 2.0F } } },
    { "tric", { { 7.0F, 0.0F, 0.0F, 0.0F, 4.1F, 0.0F, 3.5F, 2.0F, 12.2F } } },
};
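
/* Note on layout (derived from the code below): Matrix3x3 is a flat,
 * row-major array, so each consecutive triple above is one row of the box
 * matrix. E.g. the "tric" box has rows (7.0, 0.0, 0.0), (0.0, 4.1, 0.0) and
 * (3.5, 2.0, 12.2); pmeInitWrapper() copies it element-wise into a legacy
 * matrix via boxTemp[i][j] = box[i * DIM + j]. */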

//! Valid PME orders for testing
std::vector<int> c_inputPmeOrders{ 3, 4, 5 };

bool pmeSupportsInputForMode(const gmx_hw_info_t& hwinfo, const t_inputrec* inputRec, CodePath mode)
{
    bool implemented;
    switch (mode)
    {
        case CodePath::CPU: implemented = true; break;

        case CodePath::GPU:
            implemented = (pme_gpu_supports_build(nullptr) && pme_gpu_supports_hardware(hwinfo, nullptr)
                           && pme_gpu_supports_input(*inputRec, nullptr));
            break;

        default: GMX_THROW(InternalError("Test not implemented for this mode"));
    }
    return implemented;
}

uint64_t getSplineModuliDoublePrecisionUlps(int splineOrder)
{
    /* Arbitrary ulp tolerance for sine/cosine implementation. It's
     * hard to know what to pick without testing lots of
     * implementations. */
    const uint64_t sineUlps = 10;
    return 4 * (splineOrder - 2) + 2 * sineUlps * splineOrder;
}
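
/* Worked example (illustrative): for spline order 4 the tolerance evaluates
 * to 4 * (4 - 2) + 2 * 10 * 4 = 88 ulps, and for order 5 it evaluates to
 * 4 * (5 - 2) + 2 * 10 * 5 = 112 ulps. */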

//! PME initialization
PmeSafePointer pmeInitWrapper(const t_inputrec*    inputRec,
                              const CodePath       mode,
                              const DeviceContext* deviceContext,
                              const DeviceStream*  deviceStream,
                              const PmeGpuProgram* pmeGpuProgram,
                              const Matrix3x3&     box,
                              const real           ewaldCoeff_q,
                              const real           ewaldCoeff_lj)
{
    const MDLogger dummyLogger;
    const auto     runMode       = (mode == CodePath::CPU) ? PmeRunMode::CPU : PmeRunMode::Mixed;
    t_commrec      dummyCommrec  = { 0 };
    NumPmeDomains  numPmeDomains = { 1, 1 };
    gmx_pme_t*     pmeDataRaw    = gmx_pme_init(&dummyCommrec,
                                         numPmeDomains,
                                         inputRec,
                                         false,
                                         false,
                                         true,
                                         ewaldCoeff_q,
                                         ewaldCoeff_lj,
                                         1,
                                         runMode,
                                         nullptr,
                                         deviceContext,
                                         deviceStream,
                                         pmeGpuProgram,
                                         dummyLogger);
    PmeSafePointer pme(pmeDataRaw); // taking ownership

    // TODO get rid of this with proper matrix type
    matrix boxTemp;
    for (int i = 0; i < DIM; i++)
    {
        for (int j = 0; j < DIM; j++)
        {
            boxTemp[i][j] = box[i * DIM + j];
        }
    }
    const char* boxError = check_box(PbcType::Unset, boxTemp);
    GMX_RELEASE_ASSERT(boxError == nullptr, boxError);

    switch (mode)
    {
        case CodePath::CPU: invertBoxMatrix(boxTemp, pme->recipbox); break;

        case CodePath::GPU:
            pme_gpu_set_testing(pme->gpu, true);
            pme_gpu_update_input_box(pme->gpu, boxTemp);
            break;

        default: GMX_THROW(InternalError("Test not implemented for this mode"));
    }

    return pme;
}
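
/* Usage sketch (a minimal, hypothetical CPU-path example; the grid size,
 * PME order and Ewald coefficients below are illustrative values, not taken
 * from any particular test):
 *
 *   t_inputrec inputRec;
 *   inputRec.nkx = inputRec.nky = inputRec.nkz = 16;
 *   inputRec.pme_order       = 4;
 *   const PmeSafePointer pme = pmeInitWrapper(&inputRec, CodePath::CPU,
 *                                             nullptr, nullptr, nullptr,
 *                                             c_inputBoxes.at("rect"), 1.0F, 1.0F);
 */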

PmeSafePointer pmeInitEmpty(const t_inputrec* inputRec)
{
    const Matrix3x3 defaultBox = { { 1.0F, 0.0F, 0.0F, 0.0F, 1.0F, 0.0F, 0.0F, 0.0F, 1.0F } };
    return pmeInitWrapper(inputRec, CodePath::CPU, nullptr, nullptr, nullptr, defaultBox, 0.0F, 0.0F);
}

//! Make a GPU state-propagator manager
std::unique_ptr<StatePropagatorDataGpu> makeStatePropagatorDataGpu(const gmx_pme_t&     pme,
                                                                   const DeviceContext* deviceContext,
                                                                   const DeviceStream* deviceStream)
{
    // TODO: Pin the host buffer and use async memory copies
    // TODO: A special constructor for PME-only ranks / PME tests is used here. There should be
    //       a mechanism to prevent the other constructors from being used in this context.
    return std::make_unique<StatePropagatorDataGpu>(
            deviceStream, *deviceContext, GpuApiCallBehavior::Sync, pme_gpu_get_block_size(&pme), nullptr);
}

//! PME initialization with atom data
void pmeInitAtoms(gmx_pme_t*               pme,
                  StatePropagatorDataGpu*  stateGpu,
                  const CodePath           mode,
                  const CoordinatesVector& coordinates,
                  const ChargesVector&     charges)
{
    const index atomCount = coordinates.size();
    GMX_RELEASE_ASSERT(atomCount == gmx::ssize(charges), "Mismatch in atom data");
    PmeAtomComm* atc = nullptr;

    switch (mode)
    {
        case CodePath::CPU:
            atc              = &(pme->atc[0]);
            atc->x           = coordinates;
            atc->coefficient = charges;
            gmx_pme_reinit_atoms(pme, atomCount, charges, {});
            /* With decomposition there would be more boilerplate atc code here, e.g. do_redist_pos_coeffs */
            break;

        case CodePath::GPU:
            // TODO: Avoid use of atc in the GPU code path
            atc = &(pme->atc[0]);
            // We need to set atc->n for passing the size in the tests
            atc->setNumAtoms(atomCount);
            gmx_pme_reinit_atoms(pme, atomCount, charges, {});

            stateGpu->reinit(atomCount, atomCount);
            stateGpu->copyCoordinatesToGpu(arrayRefFromArray(coordinates.data(), coordinates.size()),
                                           gmx::AtomLocality::Local);
            pme_gpu_set_kernelparam_coordinates(pme->gpu, stateGpu->getCoordinates());

            break;

        default: GMX_THROW(InternalError("Test not implemented for this mode"));
    }
}

//! Getting local PME real grid pointer for test I/O
static real* pmeGetRealGridInternal(const gmx_pme_t* pme)
{
    const size_t gridIndex = 0;
    return pme->fftgrid[gridIndex];
}

//! Getting local PME real grid dimensions
static void pmeGetRealGridSizesInternal(const gmx_pme_t* pme,
                                        CodePath         mode,
                                        IVec& gridSize,       //NOLINT(google-runtime-references)
                                        IVec& paddedGridSize) //NOLINT(google-runtime-references)
{
    const size_t gridIndex = 0;
    IVec         gridOffsetUnused;
    switch (mode)
    {
        case CodePath::CPU:
            gmx_parallel_3dfft_real_limits(
                    pme->pfft_setup[gridIndex], gridSize, gridOffsetUnused, paddedGridSize);
            break;

        case CodePath::GPU:
            pme_gpu_get_real_grid_sizes(pme->gpu, &gridSize, &paddedGridSize);
            break;

        default: GMX_THROW(InternalError("Test not implemented for this mode"));
    }
}

//! Getting local PME complex grid pointer for test I/O
static t_complex* pmeGetComplexGridInternal(const gmx_pme_t* pme)
{
    const size_t gridIndex = 0;
    return pme->cfftgrid[gridIndex];
}

//! Getting local PME complex grid dimensions
static void pmeGetComplexGridSizesInternal(const gmx_pme_t* pme,
                                           IVec& gridSize,       //NOLINT(google-runtime-references)
                                           IVec& paddedGridSize) //NOLINT(google-runtime-references)
{
    const size_t gridIndex = 0;
    IVec         gridOffsetUnused, complexOrderUnused;
    gmx_parallel_3dfft_complex_limits(
            pme->pfft_setup[gridIndex], complexOrderUnused, gridSize, gridOffsetUnused, paddedGridSize); // TODO: what about YZX ordering?
}

//! Getting the PME grid memory buffer and its sizes - template definition
template<typename ValueType>
static void pmeGetGridAndSizesInternal(const gmx_pme_t* /*unused*/,
                                       CodePath /*unused*/,
                                       ValueType*& /*unused*/, //NOLINT(google-runtime-references)
                                       IVec& /*unused*/,       //NOLINT(google-runtime-references)
                                       IVec& /*unused*/)       //NOLINT(google-runtime-references)
{
    GMX_THROW(InternalError("Deleted function call"));
    // explicitly deleting general template does not compile in clang, see https://llvm.org/bugs/show_bug.cgi?id=17537
}

//! Getting the PME real grid memory buffer and its sizes
template<>
void pmeGetGridAndSizesInternal<real>(const gmx_pme_t* pme, CodePath mode, real*& grid, IVec& gridSize, IVec& paddedGridSize)
{
    grid = pmeGetRealGridInternal(pme);
    pmeGetRealGridSizesInternal(pme, mode, gridSize, paddedGridSize);
}

//! Getting the PME complex grid memory buffer and its sizes
template<>
void pmeGetGridAndSizesInternal<t_complex>(const gmx_pme_t* pme,
                                           CodePath /*unused*/,
                                           t_complex*& grid,
                                           IVec&       gridSize,
                                           IVec&       paddedGridSize)
{
    grid = pmeGetComplexGridInternal(pme);
    pmeGetComplexGridSizesInternal(pme, gridSize, paddedGridSize);
}

//! PME spline calculation and charge spreading
void pmePerformSplineAndSpread(gmx_pme_t* pme,
                               CodePath   mode, // TODO const qualifiers elsewhere
                               bool       computeSplines,
                               bool       spreadCharges)
{
    GMX_RELEASE_ASSERT(pme != nullptr, "PME data is not initialized");
    PmeAtomComm* atc                          = &(pme->atc[0]);
    const size_t gridIndex                    = 0;
    const bool   computeSplinesForZeroCharges = true;
    real**       fftgrid                      = spreadCharges ? pme->fftgrid : nullptr;
    real*        pmegrid                      = pme->pmegrid[gridIndex].grid.grid;

    switch (mode)
    {
        case CodePath::CPU:
            spread_on_grid(pme,
                           atc,
                           &pme->pmegrid[gridIndex],
                           computeSplines,
                           spreadCharges,
                           fftgrid != nullptr ? fftgrid[gridIndex] : nullptr,
                           computeSplinesForZeroCharges,
                           gridIndex);
            if (spreadCharges && !pme->bUseThreads)
            {
                wrap_periodic_pmegrid(pme, pmegrid);
                copy_pmegrid_to_fftgrid(
                        pme, pmegrid, fftgrid != nullptr ? fftgrid[gridIndex] : nullptr, gridIndex);
            }
            break;

/* The compiler will complain about passing fftgrid (converting double ** to float **) if using
 * double precision. GPUs are not used with double precision anyhow. */
#if !GMX_DOUBLE
        case CodePath::GPU:
        {
            const real lambdaQ = 1.0;
            // no synchronization needed as x is transferred in the PME stream
            GpuEventSynchronizer* xReadyOnDevice = nullptr;

            bool                           useGpuDirectComm         = false;
            gmx::PmeCoordinateReceiverGpu* pmeCoordinateReceiverGpu = nullptr;

            pme_gpu_spread(pme->gpu,
                           xReadyOnDevice,
                           fftgrid,
                           computeSplines,
                           spreadCharges,
                           lambdaQ,
                           useGpuDirectComm,
                           pmeCoordinateReceiverGpu);
        }
        break;
#endif

        default: GMX_THROW(InternalError("Test not implemented for this mode"));
    }
}

//! Getting the internal spline data buffer pointer
static real* pmeGetSplineDataInternal(const gmx_pme_t* pme, PmeSplineDataType type, int dimIndex)
{
    GMX_ASSERT((0 <= dimIndex) && (dimIndex < DIM), "Invalid dimension index");
    const PmeAtomComm* atc          = &(pme->atc[0]);
    const size_t       threadIndex  = 0;
    real*              splineBuffer = nullptr;
    switch (type)
    {
        case PmeSplineDataType::Values:
            splineBuffer = atc->spline[threadIndex].theta.coefficients[dimIndex];
            break;

        case PmeSplineDataType::Derivatives:
            splineBuffer = atc->spline[threadIndex].dtheta.coefficients[dimIndex];
            break;

        default: GMX_THROW(InternalError("Unknown spline data type"));
    }
    return splineBuffer;
}

//! PME solving
void pmePerformSolve(const gmx_pme_t*  pme,
                     CodePath          mode,
                     PmeSolveAlgorithm method,
                     real              cellVolume,
                     GridOrdering      gridOrdering,
                     bool              computeEnergyAndVirial)
{
    t_complex*   h_grid              = pmeGetComplexGridInternal(pme);
    const bool   useLorentzBerthelot = false;
    const size_t threadIndex         = 0;
    const size_t gridIndex           = 0;
    switch (mode)
    {
        case CodePath::CPU:
            if (gridOrdering != GridOrdering::YZX)
            {
                GMX_THROW(InternalError("Test not implemented for this mode"));
            }
            switch (method)
            {
                case PmeSolveAlgorithm::Coulomb:
                    solve_pme_yzx(pme, h_grid, cellVolume, computeEnergyAndVirial, pme->nthread, threadIndex);
                    break;

                case PmeSolveAlgorithm::LennardJones:
                    solve_pme_lj_yzx(pme,
                                     &h_grid,
                                     useLorentzBerthelot,
                                     cellVolume,
                                     computeEnergyAndVirial,
                                     pme->nthread,
                                     threadIndex);
                    break;

                default: GMX_THROW(InternalError("Test not implemented for this mode"));
            }
            break;

        case CodePath::GPU:
            switch (method)
            {
                case PmeSolveAlgorithm::Coulomb:
                    pme_gpu_solve(pme->gpu, gridIndex, h_grid, gridOrdering, computeEnergyAndVirial);
                    break;

                default: GMX_THROW(InternalError("Test not implemented for this mode"));
            }
            break;

        default: GMX_THROW(InternalError("Test not implemented for this mode"));
    }
}

//! PME force gathering
void pmePerformGather(gmx_pme_t* pme, CodePath mode, ForcesVector& forces)
{
    PmeAtomComm* atc       = &(pme->atc[0]);
    const index  atomCount = atc->numAtoms();
    GMX_RELEASE_ASSERT(forces.ssize() == atomCount, "Invalid force buffer size");
    const real   scale       = 1.0;
    const size_t threadIndex = 0;
    const size_t gridIndex   = 0;
    real*        pmegrid     = pme->pmegrid[gridIndex].grid.grid;
    real**       fftgrid     = pme->fftgrid;

    switch (mode)
    {
        case CodePath::CPU:
            atc->f = forces;
            if (atc->nthread == 1)
            {
                // Something that is normally done in the serial spline computation (make_thread_local_ind())
                atc->spline[threadIndex].n = atomCount;
            }
            copy_fftgrid_to_pmegrid(pme, fftgrid[gridIndex], pmegrid, gridIndex, pme->nthread, threadIndex);
            unwrap_periodic_pmegrid(pme, pmegrid);
            gather_f_bsplines(pme, pmegrid, true, atc, &atc->spline[threadIndex], scale);
            break;

/* The compiler will complain about passing fftgrid (converting double ** to float **) if using
 * double precision. GPUs are not used with double precision anyhow. */
#if !GMX_DOUBLE
        case CodePath::GPU:
        {
            // Variable initialization needs a non-switch scope
            const bool computeEnergyAndVirial = false;
            const real lambdaQ                = 1.0;
            PmeOutput  output = pme_gpu_getOutput(*pme, computeEnergyAndVirial, lambdaQ);
            GMX_ASSERT(forces.size() == output.forces_.size(),
                       "Size of force buffers did not match");
            pme_gpu_gather(pme->gpu, fftgrid, lambdaQ);
            std::copy(std::begin(output.forces_), std::end(output.forces_), std::begin(forces));
        }
        break;
#endif

        default: GMX_THROW(InternalError("Test not implemented for this mode"));
    }
}

//! PME test finalization before fetching the outputs
void pmeFinalizeTest(const gmx_pme_t* pme, CodePath mode)
{
    switch (mode)
    {
        case CodePath::CPU: break;

        case CodePath::GPU: pme_gpu_synchronize(pme->gpu); break;

        default: GMX_THROW(InternalError("Test not implemented for this mode"));
    }
}
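
/* Typical stage-test sequence (sketch; a real test also sets up splines,
 * gridline indices or grids as needed for the stage under test):
 *
 *   pmeInitAtoms(pme.get(), stateGpu.get(), mode, coordinates, charges);
 *   pmePerformSplineAndSpread(pme.get(), mode, true, true);
 *   pmeFinalizeTest(pme.get(), mode);
 *   SparseRealGridValuesOutput nonZeroGridValues = pmeGetRealGrid(pme.get(), mode);
 *
 * pmeFinalizeTest() must precede fetching the outputs so that, on the GPU
 * path, all asynchronous work has been synchronized. */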

//! A binary enum for spline data layout transformation
enum class PmeLayoutTransform
{
    GpuToHost,
    HostToGpu
};

/*! \brief Gets a unique index to an element in a spline parameter buffer.
 *
 * These theta/dtheta buffers are laid out for GPU spread/gather
 * kernels. The index is wrt the execution block, in range(0,
 * atomsPerBlock * order * DIM).
 *
 * This is a wrapper, only used in unit tests.
 * \param[in] order            PME order
 * \param[in] splineIndex      Spline contribution index (from 0 to \p order - 1)
 * \param[in] dimIndex         Dimension index (from 0 to 2)
 * \param[in] atomIndex        Atom index wrt the block.
 * \param[in] atomsPerWarp     Number of atoms processed by a warp.
 *
 * \returns Index into theta or dtheta array using GPU layout.
 */
static int getSplineParamFullIndex(int order, int splineIndex, int dimIndex, int atomIndex, int atomsPerWarp)
{
    if (order != c_pmeGpuOrder)
    {
        throw order;
    }
    constexpr int fixedOrder = c_pmeGpuOrder;
    GMX_UNUSED_VALUE(fixedOrder);

    const int atomWarpIndex = atomIndex % atomsPerWarp;
    const int warpIndex     = atomIndex / atomsPerWarp;
    int       indexBase, result;
    switch (atomsPerWarp)
    {
        case 1:
            indexBase = getSplineParamIndexBase<fixedOrder, 1>(warpIndex, atomWarpIndex);
            result    = getSplineParamIndex<fixedOrder, 1>(indexBase, dimIndex, splineIndex);
            break;

        case 2:
            indexBase = getSplineParamIndexBase<fixedOrder, 2>(warpIndex, atomWarpIndex);
            result    = getSplineParamIndex<fixedOrder, 2>(indexBase, dimIndex, splineIndex);
            break;

        case 4:
            indexBase = getSplineParamIndexBase<fixedOrder, 4>(warpIndex, atomWarpIndex);
            result    = getSplineParamIndex<fixedOrder, 4>(indexBase, dimIndex, splineIndex);
            break;

        case 8:
            indexBase = getSplineParamIndexBase<fixedOrder, 8>(warpIndex, atomWarpIndex);
            result    = getSplineParamIndex<fixedOrder, 8>(indexBase, dimIndex, splineIndex);
            break;

        default:
            GMX_THROW(NotImplementedError(
                    formatString("Test function call not unrolled for atomsPerWarp = %d in "
                                 "getSplineParamFullIndex",
                                 atomsPerWarp)));
    }
    return result;
}
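
/* Illustrative example: with order = 4 and atomsPerWarp = 2, atom 5 of a
 * block maps to warpIndex = 5 / 2 = 2 and atomWarpIndex = 5 % 2 = 1; the
 * returned value then indexes that atom's (dimIndex, splineIndex) entry
 * within warp 2's chunk of the theta/dtheta buffer. */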

/*! \brief Return the number of atoms per warp */
static int pme_gpu_get_atoms_per_warp(const PmeGpu* pmeGpu)
{
    const int order = pmeGpu->common->pme_order;
    const int threadsPerAtom =
            (pmeGpu->settings.threadsPerAtom == ThreadsPerAtom::Order ? order : order * order);
    return pmeGpu->programHandle_->warpSize() / threadsPerAtom;
}
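
/* Example (assuming a warp size of 32 and PME order 4): with
 * ThreadsPerAtom::Order, 4 threads work on each atom, giving 32 / 4 = 8
 * atoms per warp; with the order * order layout, 16 threads work on each
 * atom, giving 32 / 16 = 2 atoms per warp. */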

/*! \brief Rearranges the atom spline data between the GPU and host layouts.
 * Only used for test purposes so far, likely to be horribly slow.
 *
 * \param[in]  pmeGpu     The PME GPU structure.
 * \param[out] atc        The PME CPU atom data structure (with a single-threaded layout).
 * \param[in]  type       The spline data type (values or derivatives).
 * \param[in]  dimIndex   Dimension index.
 * \param[in]  transform  Layout transform type
 */
static void pme_gpu_transform_spline_atom_data(const PmeGpu*      pmeGpu,
                                               const PmeAtomComm* atc,
                                               PmeSplineDataType  type,
                                               int                dimIndex,
                                               PmeLayoutTransform transform)
{
    // The GPU atom spline data is laid out in a different way currently than the CPU one.
    // This function converts the data from GPU to CPU layout (in the host memory).
    // It is only intended for testing purposes so far.
    // Ideally we should use similar layouts on CPU and GPU if we care about mixed modes and their
    // performance (e.g. spreading on GPU, gathering on CPU).
    GMX_RELEASE_ASSERT(atc->nthread == 1, "Only the serial PME data layout is supported");
    const uintmax_t threadIndex  = 0;
    const auto      atomCount    = atc->numAtoms();
    const auto      atomsPerWarp = pme_gpu_get_atoms_per_warp(pmeGpu);
    const auto      pmeOrder     = pmeGpu->common->pme_order;
    GMX_ASSERT(pmeOrder == c_pmeGpuOrder, "Only PME order 4 is implemented");

    real*  cpuSplineBuffer;
    float* h_splineBuffer;
    switch (type)
    {
        case PmeSplineDataType::Values:
            cpuSplineBuffer = atc->spline[threadIndex].theta.coefficients[dimIndex];
            h_splineBuffer  = pmeGpu->staging.h_theta;
            break;

        case PmeSplineDataType::Derivatives:
            cpuSplineBuffer = atc->spline[threadIndex].dtheta.coefficients[dimIndex];
            h_splineBuffer  = pmeGpu->staging.h_dtheta;
            break;

        default: GMX_THROW(InternalError("Unknown spline data type"));
    }

    for (auto atomIndex = 0; atomIndex < atomCount; atomIndex++)
    {
        for (auto orderIndex = 0; orderIndex < pmeOrder; orderIndex++)
        {
            const auto gpuValueIndex =
                    getSplineParamFullIndex(pmeOrder, orderIndex, dimIndex, atomIndex, atomsPerWarp);
            const auto cpuValueIndex = atomIndex * pmeOrder + orderIndex;
            GMX_ASSERT(cpuValueIndex < atomCount * pmeOrder,
                       "Atom spline data index out of bounds (while transforming GPU data layout "
                       "for host)");
            switch (transform)
            {
                case PmeLayoutTransform::GpuToHost:
                    cpuSplineBuffer[cpuValueIndex] = h_splineBuffer[gpuValueIndex];
                    break;

                case PmeLayoutTransform::HostToGpu:
                    h_splineBuffer[gpuValueIndex] = cpuSplineBuffer[cpuValueIndex];
                    break;

                default: GMX_THROW(InternalError("Unknown layout transform"));
            }
        }
    }
}

//! Setting atom spline values/derivatives to be used in spread/gather
void pmeSetSplineData(const gmx_pme_t*             pme,
                      CodePath                     mode,
                      const SplineParamsDimVector& splineValues,
                      PmeSplineDataType            type,
                      int                          dimIndex)
{
    const PmeAtomComm* atc       = &(pme->atc[0]);
    const index        atomCount = atc->numAtoms();
    const index        pmeOrder  = pme->pme_order;
    const index        dimSize   = pmeOrder * atomCount;
    GMX_RELEASE_ASSERT(dimSize == splineValues.ssize(), "Mismatch in spline data");
    real* splineBuffer = pmeGetSplineDataInternal(pme, type, dimIndex);

    switch (mode)
    {
        case CodePath::CPU:
            std::copy(splineValues.begin(), splineValues.end(), splineBuffer);
            break;

        case CodePath::GPU:
            std::copy(splineValues.begin(), splineValues.end(), splineBuffer);
            pme_gpu_transform_spline_atom_data(pme->gpu, atc, type, dimIndex, PmeLayoutTransform::HostToGpu);
            break;

        default: GMX_THROW(InternalError("Test not implemented for this mode"));
    }
}

//! Setting gridline indices to be used in spread/gather
void pmeSetGridLineIndices(gmx_pme_t* pme, CodePath mode, const GridLineIndicesVector& gridLineIndices)
{
    PmeAtomComm* atc       = &(pme->atc[0]);
    const index  atomCount = atc->numAtoms();
    GMX_RELEASE_ASSERT(atomCount == ssize(gridLineIndices), "Mismatch in gridline indices size");

    IVec paddedGridSizeUnused, gridSize(0, 0, 0);
    pmeGetRealGridSizesInternal(pme, mode, gridSize, paddedGridSizeUnused);

    for (const auto& index : gridLineIndices)
    {
        for (int i = 0; i < DIM; i++)
        {
            GMX_RELEASE_ASSERT((0 <= index[i]) && (index[i] < gridSize[i]),
                               "Invalid gridline index");
        }
    }

    switch (mode)
    {
        case CodePath::GPU:
            memcpy(pme_gpu_staging(pme->gpu).h_gridlineIndices,
                   gridLineIndices.data(),
                   atomCount * sizeof(gridLineIndices[0]));
            break;

        case CodePath::CPU:
            atc->idx.resize(gridLineIndices.size());
            std::copy(gridLineIndices.begin(), gridLineIndices.end(), atc->idx.begin());
            break;
        default: GMX_THROW(InternalError("Test not implemented for this mode"));
    }
}

//! Getting plain index into the complex 3d grid
inline size_t pmeGetGridPlainIndexInternal(const IVec& index, const IVec& paddedGridSize, GridOrdering gridOrdering)
{
    size_t result;
    switch (gridOrdering)
    {
        case GridOrdering::YZX:
            result = (index[YY] * paddedGridSize[ZZ] + index[ZZ]) * paddedGridSize[XX] + index[XX];
            break;

        case GridOrdering::XYZ:
            result = (index[XX] * paddedGridSize[YY] + index[YY]) * paddedGridSize[ZZ] + index[ZZ];
            break;

        default: GMX_THROW(InternalError("Test not implemented for this mode"));
    }
    return result;
}
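
/* Worked example (illustrative): for index (1, 2, 3) in a padded grid of
 * size (10, 10, 10), XYZ ordering gives (1 * 10 + 2) * 10 + 3 = 123, while
 * YZX ordering gives (2 * 10 + 3) * 10 + 1 = 231. */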

//! Setting real or complex grid
template<typename ValueType>
static void pmeSetGridInternal(const gmx_pme_t*                        pme,
                               CodePath                                mode,
                               GridOrdering                            gridOrdering,
                               const SparseGridValuesInput<ValueType>& gridValues)
{
    IVec       gridSize(0, 0, 0), paddedGridSize(0, 0, 0);
    ValueType* grid;
    pmeGetGridAndSizesInternal<ValueType>(pme, mode, grid, gridSize, paddedGridSize);

    switch (mode)
    {
        case CodePath::GPU: // intentional absence of break, the grid will be copied from the host buffer in testing mode
        case CodePath::CPU:
            std::memset(grid, 0, paddedGridSize[XX] * paddedGridSize[YY] * paddedGridSize[ZZ] * sizeof(ValueType));
            for (const auto& gridValue : gridValues)
            {
                for (int i = 0; i < DIM; i++)
                {
                    GMX_RELEASE_ASSERT((0 <= gridValue.first[i]) && (gridValue.first[i] < gridSize[i]),
                                       "Invalid grid value index");
                }
                const size_t gridValueIndex =
                        pmeGetGridPlainIndexInternal(gridValue.first, paddedGridSize, gridOrdering);
                grid[gridValueIndex] = gridValue.second;
            }
            break;

        default: GMX_THROW(InternalError("Test not implemented for this mode"));
    }
}

//! Setting real grid to be used in gather
void pmeSetRealGrid(const gmx_pme_t* pme, CodePath mode, const SparseRealGridValuesInput& gridValues)
{
    pmeSetGridInternal<real>(pme, mode, GridOrdering::XYZ, gridValues);
}

//! Setting complex grid to be used in solve
void pmeSetComplexGrid(const gmx_pme_t*                    pme,
                       CodePath                            mode,
                       GridOrdering                        gridOrdering,
                       const SparseComplexGridValuesInput& gridValues)
{
    pmeSetGridInternal<t_complex>(pme, mode, gridOrdering, gridValues);
}

//! Getting the single dimension's spline values or derivatives
SplineParamsDimVector pmeGetSplineData(const gmx_pme_t* pme, CodePath mode, PmeSplineDataType type, int dimIndex)
{
    GMX_RELEASE_ASSERT(pme != nullptr, "PME data is not initialized");
    const PmeAtomComm* atc       = &(pme->atc[0]);
    const size_t       atomCount = atc->numAtoms();
    const size_t       pmeOrder  = pme->pme_order;
    const size_t       dimSize   = pmeOrder * atomCount;

    real*                 sourceBuffer = pmeGetSplineDataInternal(pme, type, dimIndex);
    SplineParamsDimVector result;
    switch (mode)
    {
        case CodePath::GPU:
            pme_gpu_transform_spline_atom_data(pme->gpu, atc, type, dimIndex, PmeLayoutTransform::GpuToHost);
            result = arrayRefFromArray(sourceBuffer, dimSize);
            break;

        case CodePath::CPU: result = arrayRefFromArray(sourceBuffer, dimSize); break;

        default: GMX_THROW(InternalError("Test not implemented for this mode"));
    }
    return result;
}

//! Getting the gridline indices
GridLineIndicesVector pmeGetGridlineIndices(const gmx_pme_t* pme, CodePath mode)
{
    GMX_RELEASE_ASSERT(pme != nullptr, "PME data is not initialized");
    const PmeAtomComm* atc       = &(pme->atc[0]);
    const size_t       atomCount = atc->numAtoms();

    GridLineIndicesVector gridLineIndices;
    switch (mode)
    {
        case CodePath::GPU:
        {
            auto* gridlineIndicesAsIVec =
                    reinterpret_cast<IVec*>(pme_gpu_staging(pme->gpu).h_gridlineIndices);
            ArrayRef<IVec> gridlineIndicesArrayRef = arrayRefFromArray(gridlineIndicesAsIVec, atomCount);
            gridLineIndices = { gridlineIndicesArrayRef.begin(), gridlineIndicesArrayRef.end() };
        }
        break;

        case CodePath::CPU: gridLineIndices = { atc->idx.begin(), atc->idx.end() }; break;

        default: GMX_THROW(InternalError("Test not implemented for this mode"));
    }
    return gridLineIndices;
}

//! Getting real or complex grid - only non-zero values
template<typename ValueType>
static SparseGridValuesOutput<ValueType> pmeGetGridInternal(const gmx_pme_t* pme,
                                                            CodePath         mode,
                                                            GridOrdering     gridOrdering)
{
    IVec       gridSize(0, 0, 0), paddedGridSize(0, 0, 0);
    ValueType* grid;
    pmeGetGridAndSizesInternal<ValueType>(pme, mode, grid, gridSize, paddedGridSize);
    SparseGridValuesOutput<ValueType> gridValues;
    switch (mode)
    {
        case CodePath::GPU: // intentional absence of break
        case CodePath::CPU:
            gridValues.clear();
            for (int ix = 0; ix < gridSize[XX]; ix++)
            {
                for (int iy = 0; iy < gridSize[YY]; iy++)
                {
                    for (int iz = 0; iz < gridSize[ZZ]; iz++)
                    {
                        IVec         temp(ix, iy, iz);
                        const size_t gridValueIndex =
                                pmeGetGridPlainIndexInternal(temp, paddedGridSize, gridOrdering);
                        const ValueType value = grid[gridValueIndex];
                        if (value != ValueType{})
                        {
                            auto key        = formatString("Cell %d %d %d", ix, iy, iz);
                            gridValues[key] = value;
                        }
                    }
                }
            }
            break;

        default: GMX_THROW(InternalError("Test not implemented for this mode"));
    }
    return gridValues;
}

//! Getting the real grid (spreading output of pmePerformSplineAndSpread())
SparseRealGridValuesOutput pmeGetRealGrid(const gmx_pme_t* pme, CodePath mode)
{
    return pmeGetGridInternal<real>(pme, mode, GridOrdering::XYZ);
}

//! Getting the complex grid output of pmePerformSolve()
SparseComplexGridValuesOutput pmeGetComplexGrid(const gmx_pme_t* pme, CodePath mode, GridOrdering gridOrdering)
{
    return pmeGetGridInternal<t_complex>(pme, mode, gridOrdering);
}

//! Getting the reciprocal energy and virial
PmeOutput pmeGetReciprocalEnergyAndVirial(const gmx_pme_t* pme, CodePath mode, PmeSolveAlgorithm method)
{
    PmeOutput  output;
    const real lambdaQ = 1.0;
    switch (mode)
    {
        case CodePath::CPU:
            switch (method)
            {
                case PmeSolveAlgorithm::Coulomb:
                    get_pme_ener_vir_q(pme->solve_work, pme->nthread, &output);
                    break;

                case PmeSolveAlgorithm::LennardJones:
                    get_pme_ener_vir_lj(pme->solve_work, pme->nthread, &output);
                    break;

                default: GMX_THROW(InternalError("Test not implemented for this mode"));
            }
            break;
        case CodePath::GPU:
            switch (method)
            {
                case PmeSolveAlgorithm::Coulomb:
                    pme_gpu_getEnergyAndVirial(*pme, lambdaQ, &output);
                    break;

                default: GMX_THROW(InternalError("Test not implemented for this mode"));
            }
            break;

        default: GMX_THROW(InternalError("Test not implemented for this mode"));
    }
    return output;
}

PmeTestHardwareContext::PmeTestHardwareContext() : codePath_(CodePath::CPU) {}

PmeTestHardwareContext::PmeTestHardwareContext(TestDevice* testDevice) :
    codePath_(CodePath::GPU), testDevice_(testDevice)
{
    setActiveDevice(testDevice_->deviceInfo());
    pmeGpuProgram_ = buildPmeGpuProgram(testDevice_->deviceContext());
}

//! Returns a human-readable context description line
std::string PmeTestHardwareContext::description() const
{
    switch (codePath_)
    {
        case CodePath::CPU: return "CPU";
        case CodePath::GPU: return "GPU (" + testDevice_->description() + ")";
        default: return "Unknown code path.";
    }
}

void PmeTestHardwareContext::activate() const
{
    if (codePath_ == CodePath::GPU)
    {
        setActiveDevice(testDevice_->deviceInfo());
    }
}

std::vector<std::unique_ptr<PmeTestHardwareContext>> createPmeTestHardwareContextList()
{
    std::vector<std::unique_ptr<PmeTestHardwareContext>> pmeTestHardwareContextList;
    // Add CPU
    pmeTestHardwareContextList.emplace_back(std::make_unique<PmeTestHardwareContext>());
    // Add GPU devices
    const auto& testDeviceList = getTestHardwareEnvironment()->getTestDeviceList();
    for (const auto& testDevice : testDeviceList)
    {
        pmeTestHardwareContextList.emplace_back(std::make_unique<PmeTestHardwareContext>(testDevice.get()));
    }
    return pmeTestHardwareContextList;
}

} // namespace test
} // namespace gmx