/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#include "gromacs/fft/fft.h" // for the gmx_fft_direction enum
#include "gromacs/gpu_utils/devicebuffer_datatype.h"
#include "gromacs/gpu_utils/gpu_macros.h" // for the GPU_FUNC_ macros
-#include "gromacs/utility/arrayref.h"
#include "pme_gpu_types_host.h"
#include "pme_output.h"
-class GpuEventSynchronizer;
+class DeviceContext;
struct DeviceInformation;
+class DeviceStream;
+class GpuEventSynchronizer;
struct gmx_hw_info_t;
struct gmx_gpu_opt_t;
struct gmx_pme_t; // only used in pme_gpu_reinit
struct PmeGpuSettings;
struct t_complex;
+#ifndef FEP_STATE_A
+//! Grid index of FEP state A (or unperturbed system)
+# define FEP_STATE_A 0
+#endif
+#ifndef FEP_STATE_B
+//! Grid index of FEP state B
+# define FEP_STATE_B 1
+#endif
+
namespace gmx
{
+template<typename>
+class ArrayRef;
class MDLogger;
-}
+} // namespace gmx
//! Type of spline data
enum class PmeSplineDataType
};
/*! \libinternal \brief
- * Returns the number of atoms per chunk in the atom charges/coordinates data layout.
- * Depends on CUDA-specific block sizes, needed for the atom data padding.
+ * Returns the atom data block size requirement.
*
- * \param[in] pmeGpu The PME GPU structure.
- * \returns Number of atoms in a single GPU atom data chunk.
- */
-int pme_gpu_get_atom_data_alignment(const PmeGpu* pmeGpu);
-
-/*! \libinternal \brief
- * Returns the number of atoms per chunk in the atom spline theta/dtheta data layout.
+ * The GPU version of PME requires that the coordinates array have a
+ * size divisible by the returned number.
*
- * \param[in] pmeGpu The PME GPU structure.
- * \returns Number of atoms in a single GPU atom spline data chunk.
+ * \returns Number of atoms in a single GPU atom data chunk, which
+ * determines a minimum divisor of the size of the memory allocated.
*/
-int pme_gpu_get_atoms_per_warp(const PmeGpu* pmeGpu);
+int pme_gpu_get_atom_data_block_size();
/*! \libinternal \brief
* Synchronizes the current computation, waiting for the GPU kernels/transfers to finish.
* Reallocates and copies the pre-computed B-spline values to the GPU.
*
* \param[in,out] pmeGpu The PME GPU structure.
+ * \param[in] gridIndex The index of the grid to use. 0 is Coulomb in the normal
+ * state or FEP state A and 1 is Coulomb in FEP state B.
*/
-void pme_gpu_realloc_and_copy_bspline_values(PmeGpu* pmeGpu);
+void pme_gpu_realloc_and_copy_bspline_values(PmeGpu* pmeGpu, int gridIndex = 0);
/*! \libinternal \brief
* Frees the pre-computed B-spline values on the GPU (and the transfer CPU buffers).
*
* \param[in] pmeGpu The PME GPU structure.
* \param[in] h_coefficients The input atom charges/coefficients.
+ * \param[in] gridIndex The index of the grid to use. 0 is Coulomb in the normal
+ * state or FEP state A and 1 is Coulomb in FEP state B.
*
* Does not need to be done for every PME computation, only whenever the local charges change.
* (So, in the beginning of the run, or on DD step).
*/
-void pme_gpu_realloc_and_copy_input_coefficients(PmeGpu* pmeGpu, const float* h_coefficients);
+void pme_gpu_realloc_and_copy_input_coefficients(const PmeGpu* pmeGpu,
+ const float* h_coefficients,
+ int gridIndex = 0);
/*! \libinternal \brief
* Frees the charges/coefficients on the GPU.
/*! \libinternal \brief
* Copies the input real-space grid from the host to the GPU.
*
- * \param[in] pmeGpu The PME GPU structure.
- * \param[in] h_grid The host-side grid buffer.
+ * \param[in] pmeGpu The PME GPU structure.
+ * \param[in] h_grid The host-side grid buffer.
+ * \param[in] gridIndex The index of the grid to use. 0 is Coulomb in the normal
+ * state or FEP state A and 1 is Coulomb in FEP state B.
*/
-void pme_gpu_copy_input_gather_grid(const PmeGpu* pmeGpu, float* h_grid);
+void pme_gpu_copy_input_gather_grid(const PmeGpu* pmeGpu, const float* h_grid, int gridIndex = 0);
/*! \libinternal \brief
* Copies the output real-space grid from the GPU to the host.
*
- * \param[in] pmeGpu The PME GPU structure.
- * \param[out] h_grid The host-side grid buffer.
+ * \param[in] pmeGpu The PME GPU structure.
+ * \param[out] h_grid The host-side grid buffer.
+ * \param[in] gridIndex The index of the grid to use. 0 is Coulomb in the normal
+ * state or FEP state A and 1 is Coulomb in FEP state B.
*/
-void pme_gpu_copy_output_spread_grid(const PmeGpu* pmeGpu, float* h_grid);
+void pme_gpu_copy_output_spread_grid(const PmeGpu* pmeGpu, float* h_grid, int gridIndex = 0);
/*! \libinternal \brief
* Copies the spread output spline data and gridline indices from the GPU to the host.
*
- * \param[in] pmeGpu The PME GPU structure.
+ * \param[in] pmeGpu The PME GPU structure.
*/
void pme_gpu_copy_output_spread_atom_data(const PmeGpu* pmeGpu);
/*! \libinternal \brief
* Copies the gather input spline data and gridline indices from the host to the GPU.
*
- * \param[in] pmeGpu The PME GPU structure.
+ * \param[in] pmeGpu The PME GPU structure.
*/
void pme_gpu_copy_input_gather_atom_data(const PmeGpu* pmeGpu);
*/
void pme_gpu_sync_spread_grid(const PmeGpu* pmeGpu);
-/*! \libinternal \brief
- * Does the one-time GPU-framework specific PME initialization.
- * For CUDA, the PME stream is created with the highest priority.
- *
- * \param[in] pmeGpu The PME GPU structure.
- */
-void pme_gpu_init_internal(PmeGpu* pmeGpu);
-
-/*! \libinternal \brief
- * Destroys the PME GPU-framework specific data.
- * Should be called last in the PME GPU destructor.
- *
- * \param[in] pmeGpu The PME GPU structure.
- */
-void pme_gpu_destroy_specific(const PmeGpu* pmeGpu);
-
/*! \libinternal \brief
* Initializes the CUDA FFT structures.
*
/*! \libinternal \brief
* A GPU spline computation and charge spreading function.
*
- * \param[in] pmeGpu The PME GPU structure.
- * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in the device memory;
- * can be nullptr when invoked on a separate PME rank or from PME tests.
- * \param[in] gridIndex Index of the PME grid - unused, assumed to be 0.
- * \param[out] h_grid The host-side grid buffer (used only if the result of the spread is expected on the host,
- * e.g. testing or host-side FFT)
- * \param[in] computeSplines Should the computation of spline parameters and gridline indices be performed.
- * \param[in] spreadCharges Should the charges/coefficients be spread on the grid.
- */
-GPU_FUNC_QUALIFIER void pme_gpu_spread(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu),
- GpuEventSynchronizer* GPU_FUNC_ARGUMENT(xReadyOnDevice),
- int GPU_FUNC_ARGUMENT(gridIndex),
- real* GPU_FUNC_ARGUMENT(h_grid),
- bool GPU_FUNC_ARGUMENT(computeSplines),
- bool GPU_FUNC_ARGUMENT(spreadCharges)) GPU_FUNC_TERM;
+ * \param[in] pmeGpu The PME GPU structure.
+ * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are
+ * ready in the device memory; can be nullptr when invoked
+ * on a separate PME rank or from PME tests.
+ * \param[out] h_grids The host-side grid buffers (used only if the result
+ * of the spread is expected on the host, e.g. testing
+ * or host-side FFT)
+ * \param[in] computeSplines Should the computation of spline parameters and gridline
+ * indices be performed.
+ * \param[in] spreadCharges Should the charges/coefficients be spread on the grid.
+ * \param[in] lambda The lambda value of the current system state.
+ * \param[in] useGpuDirectComm Whether direct GPU PME-PP communication is active
+ * \param[in] pmeCoordinateReceiverGpu Coordinate receiver object, which must be valid when
+ * direct GPU PME-PP communication is active
+ */
+GPU_FUNC_QUALIFIER void
+pme_gpu_spread(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu),
+ GpuEventSynchronizer* GPU_FUNC_ARGUMENT(xReadyOnDevice),
+ float** GPU_FUNC_ARGUMENT(h_grids),
+ bool GPU_FUNC_ARGUMENT(computeSplines),
+ bool GPU_FUNC_ARGUMENT(spreadCharges),
+ real GPU_FUNC_ARGUMENT(lambda),
+ const bool GPU_FUNC_ARGUMENT(useGpuDirectComm),
+ gmx::PmeCoordinateReceiverGpu* GPU_FUNC_ARGUMENT(pmeCoordinateReceiverGpu)) GPU_FUNC_TERM;
/*! \libinternal \brief
* 3D FFT R2C/C2R routine.
*
* \param[in] pmeGpu The PME GPU structure.
* \param[in] direction Transform direction (real-to-complex or complex-to-real)
- * \param[in] gridIndex Index of the PME grid - unused, assumed to be 0.
+ * \param[in] gridIndex The index of the grid to use. 0 is Coulomb in the normal
+ * state or FEP state A and 1 is Coulomb in FEP state B.
*/
-void pme_gpu_3dfft(const PmeGpu* pmeGpu, enum gmx_fft_direction direction, int gridIndex);
+void pme_gpu_3dfft(const PmeGpu* pmeGpu, enum gmx_fft_direction direction, int gridIndex = 0);
/*! \libinternal \brief
* A GPU Fourier space solving function.
*
* \param[in] pmeGpu The PME GPU structure.
+ * \param[in] gridIndex The index of the grid to use. 0 is Coulomb in the normal
+ * state or FEP state A and 1 is Coulomb in FEP state B.
* \param[in,out] h_grid The host-side input and output Fourier grid buffer (used only with testing or host-side FFT)
* \param[in] gridOrdering Specifies the dimension ordering of the complex grid. TODO: store this information?
- * \param[in] computeEnergyAndVirial Tells if the energy and virial computation should also be performed.
+ * \param[in] computeEnergyAndVirial Tells if the energy and virial computation should be performed.
*/
GPU_FUNC_QUALIFIER void pme_gpu_solve(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu),
+ int GPU_FUNC_ARGUMENT(gridIndex),
t_complex* GPU_FUNC_ARGUMENT(h_grid),
GridOrdering GPU_FUNC_ARGUMENT(gridOrdering),
bool GPU_FUNC_ARGUMENT(computeEnergyAndVirial)) GPU_FUNC_TERM;
/*! \libinternal \brief
* A GPU force gathering function.
*
- * \param[in] pmeGpu The PME GPU structure.
- * reductions. \param[in] h_grid The host-side grid buffer (used only in testing mode)
+ * \param[in] pmeGpu The PME GPU structure.
+ * \param[in] h_grids The host-side grid buffers (used only in testing mode).
+ * \param[in] lambda The lambda value to use.
*/
-GPU_FUNC_QUALIFIER void pme_gpu_gather(PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu),
- const float* GPU_FUNC_ARGUMENT(h_grid)) GPU_FUNC_TERM;
+GPU_FUNC_QUALIFIER void pme_gpu_gather(PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu),
+ float** GPU_FUNC_ARGUMENT(h_grids),
+ float GPU_FUNC_ARGUMENT(lambda)) GPU_FUNC_TERM;
+
/*! \brief Sets the device pointer to coordinate data
* \param[in] pmeGpu The PME GPU structure.
* \param[in] pmeGpu The PME GPU structure.
* \returns Pointer to force data
*/
-GPU_FUNC_QUALIFIER void* pme_gpu_get_kernelparam_forces(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu))
- GPU_FUNC_TERM_WITH_RETURN(nullptr);
-
-/*! \brief Return pointer to GPU stream.
- * \param[in] pmeGpu The PME GPU structure.
- * \returns Pointer to stream object.
- */
-GPU_FUNC_QUALIFIER void* pme_gpu_get_stream(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu))
- GPU_FUNC_TERM_WITH_RETURN(nullptr);
-
-/*! \brief Return pointer to GPU context (for OpenCL builds).
- * \param[in] pmeGpu The PME GPU structure.
- * \returns Pointer to context object.
- */
-GPU_FUNC_QUALIFIER void* pme_gpu_get_context(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu))
- GPU_FUNC_TERM_WITH_RETURN(nullptr);
+GPU_FUNC_QUALIFIER DeviceBuffer<gmx::RVec> pme_gpu_get_kernelparam_forces(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu))
+ GPU_FUNC_TERM_WITH_RETURN(DeviceBuffer<gmx::RVec>{});
/*! \brief Return pointer to the sync object triggered after the PME force calculation completion
* \param[in] pmeGpu The PME GPU structure.
* handled the solve stage.
*
* \param[in] pme The PME structure.
+ * \param[in] lambda The lambda value to use when calculating the results.
* \param[out] output Pointer to output where energy and virial should be stored.
*/
GPU_FUNC_QUALIFIER void pme_gpu_getEnergyAndVirial(const gmx_pme_t& GPU_FUNC_ARGUMENT(pme),
+ float GPU_FUNC_ARGUMENT(lambda),
PmeOutput* GPU_FUNC_ARGUMENT(output)) GPU_FUNC_TERM;
/*! \libinternal \brief
* Returns the GPU outputs (forces, energy and virial)
*
- * \param[in] pme The PME structure.
- * \param[in] flags The combination of flags that affected this PME computation.
- * The flags are the GMX_PME_ flags from pme.h.
- * \returns The output object.
+ * \param[in] pme The PME structure.
+ * \param[in] computeEnergyAndVirial Whether the energy and virial are being computed
+ * \param[in] lambdaQ The Coulomb lambda to use when finalizing the output.
+ * \returns The output object.
*/
GPU_FUNC_QUALIFIER PmeOutput pme_gpu_getOutput(const gmx_pme_t& GPU_FUNC_ARGUMENT(pme),
- int GPU_FUNC_ARGUMENT(flags))
+ bool GPU_FUNC_ARGUMENT(computeEnergyAndVirial),
+ real GPU_FUNC_ARGUMENT(lambdaQ))
GPU_FUNC_TERM_WITH_RETURN(PmeOutput{});
/*! \libinternal \brief
*/
void pme_gpu_finish_computation(const PmeGpu* pmeGpu);
-//! A binary enum for spline data layout transformation
-enum class PmeLayoutTransform
-{
- GpuToHost,
- HostToGpu
-};
-
-/*! \libinternal \brief
- * Rearranges the atom spline data between the GPU and host layouts.
- * Only used for test purposes so far, likely to be horribly slow.
- *
- * \param[in] pmeGpu The PME GPU structure.
- * \param[out] atc The PME CPU atom data structure (with a single-threaded layout).
- * \param[in] type The spline data type (values or derivatives).
- * \param[in] dimIndex Dimension index.
- * \param[in] transform Layout transform type
- */
-GPU_FUNC_QUALIFIER void pme_gpu_transform_spline_atom_data(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu),
- const PmeAtomComm* GPU_FUNC_ARGUMENT(atc),
- PmeSplineDataType GPU_FUNC_ARGUMENT(type),
- int GPU_FUNC_ARGUMENT(dimIndex),
- PmeLayoutTransform GPU_FUNC_ARGUMENT(transform)) GPU_FUNC_TERM;
-
-/*! \libinternal \brief
- * Gets a unique index to an element in a spline parameter buffer (theta/dtheta),
- * which is laid out for GPU spread/gather kernels. The index is wrt the execution block,
- * in range(0, atomsPerBlock * order * DIM).
- * This is a wrapper, only used in unit tests.
- * \param[in] order PME order
- * \param[in] splineIndex Spline contribution index (from 0 to \p order - 1)
- * \param[in] dimIndex Dimension index (from 0 to 2)
- * \param[in] atomIndex Atom index wrt the block.
- * \param[in] atomsPerWarp Number of atoms processed by a warp.
- *
- * \returns Index into theta or dtheta array using GPU layout.
- */
-int getSplineParamFullIndex(int order, int splineIndex, int dimIndex, int atomIndex, int atomsPerWarp);
-
/*! \libinternal \brief
* Get the normal/padded grid dimensions of the real-space PME grid on GPU. Only used in tests.
*
/*! \libinternal \brief
* (Re-)initializes the PME GPU data at the beginning of the run or on DLB.
*
- * \param[in,out] pme The PME structure.
- * \param[in] deviceInfo The GPU device information structure.
- * \param[in] pmeGpuProgram The PME GPU program data
+ * \param[in,out] pme The PME structure.
+ * \param[in] deviceContext The GPU context.
+ * \param[in] deviceStream The GPU stream.
+ * \param[in] pmeGpuProgram The handle to the program/kernel data created outside (e.g. in unit tests/runner)
+ *
* \throws gmx::NotImplementedError if this generally valid PME structure is not valid for GPU runs.
*/
-GPU_FUNC_QUALIFIER void pme_gpu_reinit(gmx_pme_t* GPU_FUNC_ARGUMENT(pme),
- const DeviceInformation* GPU_FUNC_ARGUMENT(deviceInfo),
+GPU_FUNC_QUALIFIER void pme_gpu_reinit(gmx_pme_t* GPU_FUNC_ARGUMENT(pme),
+ const DeviceContext* GPU_FUNC_ARGUMENT(deviceContext),
+ const DeviceStream* GPU_FUNC_ARGUMENT(deviceStream),
const PmeGpuProgram* GPU_FUNC_ARGUMENT(pmeGpuProgram)) GPU_FUNC_TERM;
/*! \libinternal \brief
*
* \param[in] pmeGpu The PME GPU structure.
* \param[in] nAtoms The number of particles.
- * \param[in] charges The pointer to the host-side array of particle charges.
+ * \param[in] chargesA The pointer to the host-side array of particle charges in the unperturbed state or FEP state A.
+ * \param[in] chargesB The pointer to the host-side array of particle charges in FEP state B.
*
* This is a function that should only be called in the beginning of the run and on domain
* decomposition. Should be called before the pme_gpu_set_io_ranges.
*/
GPU_FUNC_QUALIFIER void pme_gpu_reinit_atoms(PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu),
int GPU_FUNC_ARGUMENT(nAtoms),
- const real* GPU_FUNC_ARGUMENT(charges)) GPU_FUNC_TERM;
+ const real* GPU_FUNC_ARGUMENT(chargesA),
+ const real* GPU_FUNC_ARGUMENT(chargesB) = nullptr) GPU_FUNC_TERM;
/*! \brief \libinternal
* The PME GPU reinitialization function that is called both at the end of any PME computation and on any load balancing.
* Blocks until PME GPU tasks are completed, and gets the output forces and virial/energy
* (if they were to be computed).
*
- * \param[in] pme The PME data structure.
- * \param[in] flags The combination of flags to affect this PME computation.
- * The flags are the GMX_PME_ flags from pme.h.
- * \param[out] wcycle The wallclock counter.
- * \return The output forces, energy and virial
+ * \param[in] pme The PME data structure.
+ * \param[in] computeEnergyAndVirial Tells if the energy and virial computation should be performed.
+ * \param[in] lambdaQ The Coulomb lambda to use when calculating the results.
+ * \param[out] wcycle The wallclock counter.
+ * \return The output forces, energy and virial
*/
-GPU_FUNC_QUALIFIER PmeOutput pme_gpu_wait_finish_task(gmx_pme_t* GPU_FUNC_ARGUMENT(pme),
- int GPU_FUNC_ARGUMENT(flags),
+GPU_FUNC_QUALIFIER PmeOutput pme_gpu_wait_finish_task(gmx_pme_t* GPU_FUNC_ARGUMENT(pme),
+ bool GPU_FUNC_ARGUMENT(computeEnergyAndVirial),
+ real GPU_FUNC_ARGUMENT(lambdaQ),
gmx_wallcycle* GPU_FUNC_ARGUMENT(wcycle))
GPU_FUNC_TERM_WITH_RETURN(PmeOutput{});