Merge release-2021 into master

[alexxy/gromacs.git] / src / gromacs / ewald / pme.h
diff --git a/src/gromacs/ewald/pme.h b/src/gromacs/ewald/pme.h

index bfc79b88e99bfa5ef037fdf64e519b6da385f539..eb0c5651753535db601959d2101f7f8e12669ebd 100644 (file)
--- a/src/gromacs/ewald/pme.h
+++ b/src/gromacs/ewald/pme.h
@@ -4,7 +4,7 @@
   * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   * Copyright (c) 2001-2004, The GROMACS development team.
   * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -50,12 +50,11 @@
  #define GMX_EWALD_PME_H
  
  #include <string>
+#include <vector>
  
  #include "gromacs/gpu_utils/devicebuffer_datatype.h"
  #include "gromacs/gpu_utils/gpu_macros.h"
  #include "gromacs/math/vectypes.h"
-#include "gromacs/utility/arrayref.h"
-#include "gromacs/utility/basedefinitions.h"
  #include "gromacs/utility/real.h"
  
  struct gmx_hw_info_t;
@@ -64,7 +63,6 @@ struct t_inputrec;
  struct t_nrnb;
  struct PmeGpu;
  struct gmx_wallclock_gpu_pme_t;
-struct DeviceInformation;
  struct gmx_enerdata_t;
  struct gmx_mtop_t;
  struct gmx_pme_t;
@@ -77,12 +75,51 @@ enum class GpuTaskCompletion;
  class PmeGpuProgram;
  class GpuEventSynchronizer;
  
+/*! \brief Hack to selectively enable some parts of PME during unit testing.
+ *
+ * Set to \c false by default. If any of the tests sets it to \c true, it will
+ * make the compatibility check consider PME to be supported in SYCL builds.
+ *
+ * Currently we don't have proper PME implementation with SYCL, but we still want
+ * to run tests for some of the kernels.
+ *
+ * \todo Remove after #3927 is done and PME is fully enabled in SYCL builds.
+ */
+//NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+extern bool g_allowPmeWithSyclForTesting;
+
  namespace gmx
  {
+template<typename>
+class ArrayRef;
  class ForceWithVirial;
  class MDLogger;
  enum class PinningPolicy : int;
  class StepWorkload;
+
+/*! \libinternal \brief Class for managing usage of separate PME-only ranks
+ *
+ * Used for checking if some parts of the code could not use PME-only ranks
+ *
+ */
+class SeparatePmeRanksPermitted
+{
+public:
+    //! Disables PME ranks permitted flag with a reason
+    void disablePmeRanks(const std::string& reason);
+    //! Return status of PME ranks usage
+    bool permitSeparatePmeRanks() const;
+    //! Returns all reasons, for not using PME ranks
+    std::string reasonsWhyDisabled() const;
+
+private:
+    //! Flag that informs whether simualtion could use dedicated PME ranks
+    bool permitSeparatePmeRanks_ = true;
+    //! Storage for all reasons, why PME ranks could not be used
+    std::vector<std::string> reasons_;
+};
+
+class PmeCoordinateReceiverGpu;
  } // namespace gmx
  
  enum
@@ -135,22 +172,23 @@ bool gmx_pme_check_restrictions(int  pme_order,
   * \todo We should evolve something like a \c GpuManager that holds \c
   * DeviceInformation* and \c PmeGpuProgram* and perhaps other
   * related things whose lifetime can/should exceed that of a task (or
- * perhaps task manager). See Redmine #2522.
+ * perhaps task manager). See Issue #2522.
   */
-gmx_pme_t* gmx_pme_init(const t_commrec*         cr,
-                        const NumPmeDomains&     numPmeDomains,
-                        const t_inputrec*        ir,
-                        gmx_bool                 bFreeEnergy_q,
-                        gmx_bool                 bFreeEnergy_lj,
-                        gmx_bool                 bReproducible,
-                        real                     ewaldcoeff_q,
-                        real                     ewaldcoeff_lj,
-                        int                      nthread,
-                        PmeRunMode               runMode,
-                        PmeGpu*                  pmeGpu,
-                        const DeviceInformation* deviceInfo,
-                        const PmeGpuProgram*     pmeGpuProgram,
-                        const gmx::MDLogger&     mdlog);
+gmx_pme_t* gmx_pme_init(const t_commrec*     cr,
+                        const NumPmeDomains& numPmeDomains,
+                        const t_inputrec*    ir,
+                        gmx_bool             bFreeEnergy_q,
+                        gmx_bool             bFreeEnergy_lj,
+                        gmx_bool             bReproducible,
+                        real                 ewaldcoeff_q,
+                        real                 ewaldcoeff_lj,
+                        int                  nthread,
+                        PmeRunMode           runMode,
+                        PmeGpu*              pmeGpu,
+                        const DeviceContext* deviceContext,
+                        const DeviceStream*  deviceStream,
+                        const PmeGpuProgram* pmeGpuProgram,
+                        const gmx::MDLogger& mdlog);
  
  /*! \brief As gmx_pme_init, but takes most settings, except the grid/Ewald coefficients, from
   * pme_src. This is only called when the PME cut-off/grid size changes.
@@ -180,12 +218,12 @@ void gmx_pme_destroy(gmx_pme_t* pme);
  int gmx_pme_do(struct gmx_pme_t*              pme,
                 gmx::ArrayRef<const gmx::RVec> coordinates,
                 gmx::ArrayRef<gmx::RVec>       forces,
-               real                           chargeA[],
-               real                           chargeB[],
-               real                           c6A[],
-               real                           c6B[],
-               real                           sigmaA[],
-               real                           sigmaB[],
+               gmx::ArrayRef<const real>      chargeA,
+               gmx::ArrayRef<const real>      chargeB,
+               gmx::ArrayRef<const real>      c6A,
+               gmx::ArrayRef<const real>      c6B,
+               gmx::ArrayRef<const real>      sigmaA,
+               gmx::ArrayRef<const real>      sigmaB,
                 const matrix                   box,
                 const t_commrec*               cr,
                 int                            maxshift_x,
@@ -209,7 +247,7 @@ int gmx_pme_do(struct gmx_pme_t*              pme,
   * pme struct. Currently does not work in parallel or with free
   * energy.
   */
-void gmx_pme_calc_energy(gmx_pme_t* pme, gmx::ArrayRef<const gmx::RVec> x, gmx::ArrayRef<const real> q, real* V);
+real gmx_pme_calc_energy(gmx_pme_t* pme, gmx::ArrayRef<const gmx::RVec> x, gmx::ArrayRef<const real> q);
  
  /*! \brief
   * This function updates the local atom data on GPU after DD (charges, coordinates, etc.).
@@ -218,9 +256,15 @@ void gmx_pme_calc_energy(gmx_pme_t* pme, gmx::ArrayRef<const gmx::RVec> x, gmx::
   *
   * \param[in,out] pme        The PME structure.
   * \param[in]     numAtoms   The number of particles.
- * \param[in]     charges    The pointer to the array of particle charges.
+ * \param[in]     chargesA   The pointer to the array of particle charges in the normal state or FEP
+ * state A. Can be nullptr if PME is not performed on the GPU.
+ * \param[in]     chargesB   The pointer to the array of particle charges in state B. Only used if
+ * charges are perturbed and can otherwise be nullptr.
   */
-void gmx_pme_reinit_atoms(gmx_pme_t* pme, int numAtoms, const real* charges);
+void gmx_pme_reinit_atoms(gmx_pme_t*                pme,
+                          int                       numAtoms,
+                          gmx::ArrayRef<const real> chargesA,
+                          gmx::ArrayRef<const real> chargesB);
  
  /* A block of PME GPU functions */
  
@@ -250,12 +294,22 @@ bool pme_gpu_supports_hardware(const gmx_hw_info_t& hwinfo, std::string* error);
   * formed gmx_pme_t structure. Should that one go away/work with inputrec?
   *
   * \param[in]  ir     Input system.
- * \param[in]  mtop   Complete system topology to check if an FE simulation perturbs charges.
   * \param[out] error  If non-null, the error message if the input is not supported on GPU.
   *
   * \returns true if PME can run on GPU with this input, false otherwise.
   */
-bool pme_gpu_supports_input(const t_inputrec& ir, const gmx_mtop_t& mtop, std::string* error);
+bool pme_gpu_supports_input(const t_inputrec& ir, std::string* error);
+
+/*! \brief Checks whether the input system allows to run PME on GPU in Mixed mode.
+ * Assumes that the input system is compatible with GPU PME otherwise, that is,
+ * before calling this function one should check that \ref pme_gpu_supports_input returns \c true.
+ *
+ * \param[in]  ir     Input system.
+ * \param[out] error  If non-null, the error message if the input is not supported.
+ *
+ * \returns true if PME can run on GPU in Mixed mode with this input, false otherwise.
+ */
+bool pme_gpu_mixed_mode_supports_input(const t_inputrec& ir, std::string* error);
  
  /*! \brief
   * Returns the active PME codepath (CPU, GPU, mixed).
@@ -331,13 +385,24 @@ GPU_FUNC_QUALIFIER void pme_gpu_prepare_computation(gmx_pme_t*     GPU_FUNC_ARGU
  /*! \brief
   * Launches first stage of PME on GPU - spreading kernel.
   *
- * \param[in] pme                The PME data structure.
- * \param[in] xReadyOnDevice     Event synchronizer indicating that the coordinates are ready in the device memory; nullptr allowed only on separate PME ranks.
- * \param[in] wcycle             The wallclock counter.
+ * \param[in] pme                            The PME data structure.
+ * \param[in] xReadyOnDevice                 Event synchronizer indicating that the coordinates
+ *                                           are ready in the device memory; nullptr allowed only
+ *                                           on separate PME ranks.
+ * \param[in] wcycle                         The wallclock counter.
+ * \param[in] lambdaQ                        The Coulomb lambda of the current state of the
+ *                                           system. Only used if FEP of Coulomb is active.
+ * \param[in] useGpuDirectComm               Whether direct GPU PME-PP communication is active
+ * \param[in]  pmeCoordinateReceiverGpu      Coordinate receiver object, which must be valid when
+ *                                           direct GPU PME-PP communication is active
   */
-GPU_FUNC_QUALIFIER void pme_gpu_launch_spread(gmx_pme_t*            GPU_FUNC_ARGUMENT(pme),
-                                              GpuEventSynchronizer* GPU_FUNC_ARGUMENT(xReadyOnDevice),
-                                              gmx_wallcycle* GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM;
+GPU_FUNC_QUALIFIER void pme_gpu_launch_spread(
+        gmx_pme_t*                     GPU_FUNC_ARGUMENT(pme),
+        GpuEventSynchronizer*          GPU_FUNC_ARGUMENT(xReadyOnDevice),
+        gmx_wallcycle*                 GPU_FUNC_ARGUMENT(wcycle),
+        real                           GPU_FUNC_ARGUMENT(lambdaQ),
+        bool                           GPU_FUNC_ARGUMENT(useGpuDirectComm),
+        gmx::PmeCoordinateReceiverGpu* GPU_FUNC_ARGUMENT(pmeCoordinateReceiverGpu)) GPU_FUNC_TERM;
  
  /*! \brief
   * Launches middle stages of PME (FFT R2C, solving, FFT C2R) either on GPU or on CPU, depending on the run mode.
@@ -354,11 +419,13 @@ pme_gpu_launch_complex_transforms(gmx_pme_t*               GPU_FUNC_ARGUMENT(pme
  /*! \brief
   * Launches last stage of PME on GPU - force gathering and D2H force transfer.
   *
- * \param[in]  pme               The PME data structure.
- * \param[in]  wcycle            The wallclock counter.
+ * \param[in] pme               The PME data structure.
+ * \param[in] wcycle            The wallclock counter.
+ * \param[in] lambdaQ           The Coulomb lambda to use when calculating the results.
   */
  GPU_FUNC_QUALIFIER void pme_gpu_launch_gather(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme),
-                                              gmx_wallcycle* GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM;
+                                              gmx_wallcycle*   GPU_FUNC_ARGUMENT(wcycle),
+                                              real GPU_FUNC_ARGUMENT(lambdaQ)) GPU_FUNC_TERM;
  
  /*! \brief
   * Attempts to complete PME GPU tasks.
@@ -375,6 +442,7 @@ GPU_FUNC_QUALIFIER void pme_gpu_launch_gather(const gmx_pme_t* GPU_FUNC_ARGUMENT
   * \param[in]  wcycle          The wallclock counter.
   * \param[out] forceWithVirial The output force and virial
   * \param[out] enerd           The output energies
+ * \param[in]  lambdaQ         The Coulomb lambda to use when calculating the results.
   * \param[in]  completionKind  Indicates whether PME task completion should only be checked rather
   *                             than waited for
   * \returns                    True if the PME GPU tasks have completed
@@ -384,6 +452,7 @@ GPU_FUNC_QUALIFIER bool pme_gpu_try_finish_task(gmx_pme_t*               GPU_FUN
                                                  gmx_wallcycle*           GPU_FUNC_ARGUMENT(wcycle),
                                                  gmx::ForceWithVirial* GPU_FUNC_ARGUMENT(forceWithVirial),
                                                  gmx_enerdata_t*       GPU_FUNC_ARGUMENT(enerd),
+                                                real                  GPU_FUNC_ARGUMENT(lambdaQ),
                                                  GpuTaskCompletion GPU_FUNC_ARGUMENT(completionKind))
          GPU_FUNC_TERM_WITH_RETURN(false);
  
@@ -396,12 +465,14 @@ GPU_FUNC_QUALIFIER bool pme_gpu_try_finish_task(gmx_pme_t*               GPU_FUN
   * \param[in]  wcycle          The wallclock counter.
   * \param[out] forceWithVirial The output force and virial
   * \param[out] enerd           The output energies
+ * \param[in]  lambdaQ         The Coulomb lambda to use when calculating the results.
   */
  GPU_FUNC_QUALIFIER void pme_gpu_wait_and_reduce(gmx_pme_t*               GPU_FUNC_ARGUMENT(pme),
                                                  const gmx::StepWorkload& GPU_FUNC_ARGUMENT(stepWork),
                                                  gmx_wallcycle*           GPU_FUNC_ARGUMENT(wcycle),
                                                  gmx::ForceWithVirial* GPU_FUNC_ARGUMENT(forceWithVirial),
-                                                gmx_enerdata_t* GPU_FUNC_ARGUMENT(enerd)) GPU_FUNC_TERM;
+                                                gmx_enerdata_t*       GPU_FUNC_ARGUMENT(enerd),
+                                                real GPU_FUNC_ARGUMENT(lambdaQ)) GPU_FUNC_TERM;
  
  /*! \brief
   * The PME GPU reinitialization function that is called both at the end of any PME computation and on any load balancing.
@@ -430,19 +501,12 @@ GPU_FUNC_QUALIFIER void pme_gpu_set_device_x(const gmx_pme_t*        GPU_FUNC_AR
   * \param[in] pme            The PME data structure.
   * \returns                  Pointer to force data
   */
-GPU_FUNC_QUALIFIER void* pme_gpu_get_device_f(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme))
-        GPU_FUNC_TERM_WITH_RETURN(nullptr);
-
-/*! \brief Returns the pointer to the GPU stream.
- *  \param[in] pme            The PME data structure.
- *  \returns                  Pointer to GPU stream object.
- */
-GPU_FUNC_QUALIFIER const DeviceStream* pme_gpu_get_device_stream(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme))
-        GPU_FUNC_TERM_WITH_RETURN(nullptr);
+GPU_FUNC_QUALIFIER DeviceBuffer<gmx::RVec> pme_gpu_get_device_f(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme))
+        GPU_FUNC_TERM_WITH_RETURN(DeviceBuffer<gmx::RVec>{});
  
  /*! \brief Get pointer to the device synchronizer object that allows syncing on PME force calculation completion
   * \param[in] pme            The PME data structure.
- * \returns                  Pointer to sychronizer
+ * \returns                  Pointer to synchronizer
   */
  GPU_FUNC_QUALIFIER GpuEventSynchronizer* pme_gpu_get_f_ready_synchronizer(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme))
          GPU_FUNC_TERM_WITH_RETURN(nullptr);