From f7be07e3cc901eb03700d93248fc09b573370282 Mon Sep 17 00:00:00 2001 From: Magnus Lundborg Date: Mon, 20 Jul 2020 10:17:02 +0000 Subject: [PATCH] Two sets of coefficients for Coulomb FEP PME on GPU The first patch in a series to enable running Coulomb FEP PME on GPU. Use two sets of coefficients to store atom charges. Refs #2054, #3117 Change-Id: Iab6eb7ac766800f7c045dc5a00069e77509d391f --- docs/release-notes/2021/major/performance.rst | 5 + docs/user-guide/mdrun-performance.rst | 3 - src/gromacs/domdec/mdsetup.cpp | 2 +- src/gromacs/ewald/pme.cpp | 22 +- src/gromacs/ewald/pme.h | 32 +- src/gromacs/ewald/pme_gather.clh | 240 ++++++-- src/gromacs/ewald/pme_gather.cu | 295 ++++++--- src/gromacs/ewald/pme_gpu.cpp | 104 ++-- src/gromacs/ewald/pme_gpu_3dfft.cu | 6 +- src/gromacs/ewald/pme_gpu_3dfft.h | 3 +- src/gromacs/ewald/pme_gpu_3dfft_ocl.cpp | 6 +- .../ewald/pme_gpu_calculate_splines.cuh | 15 +- src/gromacs/ewald/pme_gpu_internal.cpp | 579 +++++++++++++----- src/gromacs/ewald/pme_gpu_internal.h | 94 ++- src/gromacs/ewald/pme_gpu_program_impl.cu | 114 ++-- src/gromacs/ewald/pme_gpu_program_impl.h | 54 +- .../ewald/pme_gpu_program_impl_ocl.cpp | 105 +++- src/gromacs/ewald/pme_gpu_staging.h | 9 +- src/gromacs/ewald/pme_gpu_types.h | 20 +- src/gromacs/ewald/pme_gpu_types_host.h | 2 +- src/gromacs/ewald/pme_gpu_types_host_impl.h | 21 +- src/gromacs/ewald/pme_only.cpp | 10 +- src/gromacs/ewald/pme_output.h | 4 + src/gromacs/ewald/pme_program.cl | 115 +++- src/gromacs/ewald/pme_solve.clh | 8 +- src/gromacs/ewald/pme_solve.cu | 42 +- src/gromacs/ewald/pme_spread.clh | 40 +- src/gromacs/ewald/pme_spread.cu | 75 ++- src/gromacs/ewald/tests/pmetestcommon.cpp | 47 +- src/gromacs/mdlib/mdatoms.cpp | 30 +- src/gromacs/mdlib/mdatoms.h | 20 +- src/gromacs/mdlib/sim_util.cpp | 30 +- src/gromacs/mdrun/runner.cpp | 4 +- src/gromacs/mdrun/tpi.cpp | 2 +- src/gromacs/taskassignment/decidegpuusage.cpp | 7 +- src/gromacs/taskassignment/decidegpuusage.h | 4 - .../taskassignment/resourcedivision.cpp | 2 +- 37 files changed, 1540 insertions(+), 631 deletions(-) diff --git a/docs/release-notes/2021/major/performance.rst b/docs/release-notes/2021/major/performance.rst index 0931f3843f..d486d0c308 100644 --- a/docs/release-notes/2021/major/performance.rst +++ b/docs/release-notes/2021/major/performance.rst @@ -20,3 +20,8 @@ The time `gmx grompp` spent processing distance restraint has been changed from quadratic in the number of restraints to linear. :issue:`3457` + +Support for offloading PME to GPU when doing Coulomb FEP +"""""""""""""""""""""""""""""""""""""""""""""""""""""""" + +PME calculations can be offloaded to GPU when doing Coulomb free-energy perturbations. diff --git a/docs/user-guide/mdrun-performance.rst b/docs/user-guide/mdrun-performance.rst index d14e4fe43d..1309ac0806 100644 --- a/docs/user-guide/mdrun-performance.rst +++ b/docs/user-guide/mdrun-performance.rst @@ -1093,9 +1093,6 @@ Known limitations - Only single precision is supported. -- Free energy calculations where charges are perturbed are not supported, - because only single PME grids can be calculated. - - Only dynamical integrators are supported (ie. leap-frog, Velocity Verlet, stochastic dynamics) diff --git a/src/gromacs/domdec/mdsetup.cpp b/src/gromacs/domdec/mdsetup.cpp index 2c8027b7f2..9c71dd410c 100644 --- a/src/gromacs/domdec/mdsetup.cpp +++ b/src/gromacs/domdec/mdsetup.cpp @@ -137,7 +137,7 @@ void mdAlgorithmsSetupAtomData(const t_commrec* cr, * For PME-only ranks, gmx_pmeonly() has its own call to gmx_pme_reinit_atoms(). 
*/ const int numPmeAtoms = numHomeAtoms - fr->n_tpi; - gmx_pme_reinit_atoms(fr->pmedata, numPmeAtoms, mdatoms->chargeA); + gmx_pme_reinit_atoms(fr->pmedata, numPmeAtoms, mdatoms->chargeA, mdatoms->chargeB); } if (constr) diff --git a/src/gromacs/ewald/pme.cpp b/src/gromacs/ewald/pme.cpp index 0f59ebe63e..f26c2f6fe3 100644 --- a/src/gromacs/ewald/pme.cpp +++ b/src/gromacs/ewald/pme.cpp @@ -179,7 +179,7 @@ bool pme_gpu_supports_hardware(const gmx_hw_info_t gmx_unused& hwinfo, std::stri return addMessageIfNotSupported(errorReasons, error); } -bool pme_gpu_supports_input(const t_inputrec& ir, const gmx_mtop_t& mtop, std::string* error) +bool pme_gpu_supports_input(const t_inputrec& ir, std::string* error) { std::list errorReasons; if (!EEL_PME(ir.coulombtype)) @@ -190,14 +190,6 @@ bool pme_gpu_supports_input(const t_inputrec& ir, const gmx_mtop_t& mtop, std::s { errorReasons.emplace_back("interpolation orders other than 4"); } - if (ir.efep != efepNO) - { - if (gmx_mtop_has_perturbed_charges(mtop)) - { - errorReasons.emplace_back( - "free energy calculations with perturbed charges (multiple grids)"); - } - } if (EVDW_PME(ir.vdwtype)) { errorReasons.emplace_back("Lennard-Jones PME"); @@ -231,10 +223,6 @@ static bool pme_gpu_check_restrictions(const gmx_pme_t* pme, std::string* error) { errorReasons.emplace_back("interpolation orders other than 4"); } - if (pme->bFEP) - { - errorReasons.emplace_back("free energy calculations (multiple grids)"); - } if (pme->doLJ) { errorReasons.emplace_back("Lennard-Jones PME"); @@ -939,7 +927,7 @@ void gmx_pme_reinit(struct gmx_pme_t** pmedata, */ if (!pme_src->gpu && pme_src->nnodes == 1) { - gmx_pme_reinit_atoms(*pmedata, pme_src->atc[0].numAtoms(), nullptr); + gmx_pme_reinit_atoms(*pmedata, pme_src->atc[0].numAtoms(), nullptr, nullptr); } // TODO this is mostly passing around current values } @@ -1703,11 +1691,13 @@ void gmx_pme_destroy(gmx_pme_t* pme) delete pme; } -void gmx_pme_reinit_atoms(gmx_pme_t* pme, const int numAtoms, const real* charges) +void gmx_pme_reinit_atoms(gmx_pme_t* pme, const int numAtoms, const real* chargesA, const real* chargesB) { if (pme->gpu != nullptr) { - pme_gpu_reinit_atoms(pme->gpu, numAtoms, charges); + GMX_ASSERT(!(pme->bFEP_q && chargesB == nullptr), + "B state charges must be specified if running Coulomb FEP on the GPU"); + pme_gpu_reinit_atoms(pme->gpu, numAtoms, chargesA, pme->bFEP_q ? chargesB : nullptr); } else { diff --git a/src/gromacs/ewald/pme.h b/src/gromacs/ewald/pme.h index d60f2e6326..0408b87f77 100644 --- a/src/gromacs/ewald/pme.h +++ b/src/gromacs/ewald/pme.h @@ -219,9 +219,12 @@ void gmx_pme_calc_energy(gmx_pme_t* pme, gmx::ArrayRef x, gmx:: * * \param[in,out] pme The PME structure. * \param[in] numAtoms The number of particles. - * \param[in] charges The pointer to the array of particle charges. + * \param[in] chargesA The pointer to the array of particle charges in the normal state or FEP + * state A. Can be nullptr if PME is not performed on the GPU. + * \param[in] chargesB The pointer to the array of particle charges in state B. Only used if + * charges are perturbed and can otherwise be nullptr. */ -void gmx_pme_reinit_atoms(gmx_pme_t* pme, int numAtoms, const real* charges); +void gmx_pme_reinit_atoms(gmx_pme_t* pme, int numAtoms, const real* chargesA, const real* chargesB); /* A block of PME GPU functions */ @@ -251,12 +254,11 @@ bool pme_gpu_supports_hardware(const gmx_hw_info_t& hwinfo, std::string* error); * formed gmx_pme_t structure. Should that one go away/work with inputrec? 
* * \param[in] ir Input system. - * \param[in] mtop Complete system topology to check if an FE simulation perturbs charges. * \param[out] error If non-null, the error message if the input is not supported on GPU. * * \returns true if PME can run on GPU with this input, false otherwise. */ -bool pme_gpu_supports_input(const t_inputrec& ir, const gmx_mtop_t& mtop, std::string* error); +bool pme_gpu_supports_input(const t_inputrec& ir, std::string* error); /*! \brief * Returns the active PME codepath (CPU, GPU, mixed). @@ -333,12 +335,16 @@ GPU_FUNC_QUALIFIER void pme_gpu_prepare_computation(gmx_pme_t* GPU_FUNC_ARGU * Launches first stage of PME on GPU - spreading kernel. * * \param[in] pme The PME data structure. - * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in the device memory; nullptr allowed only on separate PME ranks. + * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates + * are ready in the device memory; nullptr allowed only on separate PME ranks. * \param[in] wcycle The wallclock counter. + * \param[in] lambdaQ The Coulomb lambda of the current state of the + * system. Only used if FEP of Coulomb is active. */ GPU_FUNC_QUALIFIER void pme_gpu_launch_spread(gmx_pme_t* GPU_FUNC_ARGUMENT(pme), GpuEventSynchronizer* GPU_FUNC_ARGUMENT(xReadyOnDevice), - gmx_wallcycle* GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM; + gmx_wallcycle* GPU_FUNC_ARGUMENT(wcycle), + const real GPU_FUNC_ARGUMENT(lambdaQ)) GPU_FUNC_TERM; /*! \brief * Launches middle stages of PME (FFT R2C, solving, FFT C2R) either on GPU or on CPU, depending on the run mode. @@ -355,11 +361,13 @@ pme_gpu_launch_complex_transforms(gmx_pme_t* GPU_FUNC_ARGUMENT(pme /*! \brief * Launches last stage of PME on GPU - force gathering and D2H force transfer. * - * \param[in] pme The PME data structure. - * \param[in] wcycle The wallclock counter. + * \param[in] pme The PME data structure. + * \param[in] wcycle The wallclock counter. + * \param[in] lambdaQ The Coulomb lambda to use when calculating the results. */ GPU_FUNC_QUALIFIER void pme_gpu_launch_gather(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme), - gmx_wallcycle* GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM; + gmx_wallcycle* GPU_FUNC_ARGUMENT(wcycle), + const real GPU_FUNC_ARGUMENT(lambdaQ)) GPU_FUNC_TERM; /*! \brief * Attempts to complete PME GPU tasks. @@ -376,6 +384,7 @@ GPU_FUNC_QUALIFIER void pme_gpu_launch_gather(const gmx_pme_t* GPU_FUNC_ARGUMENT * \param[in] wcycle The wallclock counter. * \param[out] forceWithVirial The output force and virial * \param[out] enerd The output energies + * \param[in] lambdaQ The Coulomb lambda to use when calculating the results. * \param[in] completionKind Indicates whether PME task completion should only be checked rather * than waited for * \returns True if the PME GPU tasks have completed @@ -385,6 +394,7 @@ GPU_FUNC_QUALIFIER bool pme_gpu_try_finish_task(gmx_pme_t* GPU_FUN gmx_wallcycle* GPU_FUNC_ARGUMENT(wcycle), gmx::ForceWithVirial* GPU_FUNC_ARGUMENT(forceWithVirial), gmx_enerdata_t* GPU_FUNC_ARGUMENT(enerd), + const real GPU_FUNC_ARGUMENT(lambdaQ), GpuTaskCompletion GPU_FUNC_ARGUMENT(completionKind)) GPU_FUNC_TERM_WITH_RETURN(false); @@ -397,12 +407,14 @@ GPU_FUNC_QUALIFIER bool pme_gpu_try_finish_task(gmx_pme_t* GPU_FUN * \param[in] wcycle The wallclock counter. * \param[out] forceWithVirial The output force and virial * \param[out] enerd The output energies + * \param[in] lambdaQ The Coulomb lambda to use when calculating the results. 
*/ GPU_FUNC_QUALIFIER void pme_gpu_wait_and_reduce(gmx_pme_t* GPU_FUNC_ARGUMENT(pme), const gmx::StepWorkload& GPU_FUNC_ARGUMENT(stepWork), gmx_wallcycle* GPU_FUNC_ARGUMENT(wcycle), gmx::ForceWithVirial* GPU_FUNC_ARGUMENT(forceWithVirial), - gmx_enerdata_t* GPU_FUNC_ARGUMENT(enerd)) GPU_FUNC_TERM; + gmx_enerdata_t* GPU_FUNC_ARGUMENT(enerd), + const real GPU_FUNC_ARGUMENT(lambdaQ)) GPU_FUNC_TERM; /*! \brief * The PME GPU reinitialization function that is called both at the end of any PME computation and on any load balancing. diff --git a/src/gromacs/ewald/pme_gather.clh b/src/gromacs/ewald/pme_gather.clh index 2d51020763..8c7106acf9 100644 --- a/src/gromacs/ewald/pme_gather.clh +++ b/src/gromacs/ewald/pme_gather.clh @@ -184,6 +184,86 @@ inline void reduce_atom_forces(__local float* __restrict__ sm_forces, } } +/*! \brief Calculate the sum of the force partial components (in X, Y and Z) + * + * \param[out] fx The force partial component in the X dimension. + * \param[out] fy The force partial component in the Y dimension. + * \param[out] fz The force partial component in the Z dimension. + * \param[in] ixBase The grid line index base value in the X dimension. + * \param[in] nx The grid real size in the X dimension. + * \param[in] pny The padded grid real size in the Y dimension. + * \param[in] pnz The padded grid real size in the Z dimension. + * \param[in] constOffset The offset to calculate the global grid index. + * \param[in] splineIndexBase The base value of the spline parameter index. + * \param[in] tdy The theta and dtheta in the Y dimension. + * \param[in] tdz The theta and dtheta in the Z dimension. + * \param[in] sm_splineParams Shared memory array of spline parameters. + * \param[in] gm_grid Global memory array of the grid to use. + */ +inline void sumForceComponents(float* fx, + float* fy, + float* fz, + const int ixBase, + const int nx, + const int pny, + const int pnz, + const int constOffset, + const int splineIndexBase, + const float2 tdy, + const float2 tdz, + __local const float2* __restrict__ sm_splineParams, + __global const float* __restrict__ gm_grid) +{ +# pragma unroll order + for (int ithx = 0; (ithx < order); ithx++) + { + int ix = ixBase + ithx; + if (wrapX & (ix >= nx)) + { + ix -= nx; + } + const int gridIndexGlobal = ix * pny * pnz + constOffset; + assert(gridIndexGlobal >= 0); + const float gridValue = gm_grid[gridIndexGlobal]; + assert(isfinite(gridValue)); + const int splineIndexX = getSplineParamIndex(splineIndexBase, XX, ithx); + const float2 tdx = sm_splineParams[splineIndexX]; + const float fxy1 = tdz.x * gridValue; + const float fz1 = tdz.y * gridValue; + *fx += tdx.y * tdy.x * fxy1; + *fy += tdx.x * tdy.y * fxy1; + *fz += tdx.x * tdy.x * fz1; + } +} + +/*! \brief Calculate the grid forces and store them in shared memory. + * + * \param[in,out] sm_forces Shared memory array with the output forces. + * \param[in] forceIndexLocal The local (per thread) index in the sm_forces array. + * \param[in] forceIndexGlobal The index of the thread in the gm_coefficients array. + * \param[in] recipBox The reciprocal box. + * \param[in] scale The scale to use when calculating the forces (only relevant when + * using two grids). + * \param[in] gm_coefficients Global memory array of the coefficients to use. 
+ */ +inline void calculateAndStoreGridForces(__local float* __restrict__ sm_forces, + const int forceIndexLocal, + const int forceIndexGlobal, + const float recipBox[DIM][DIM], + const float scale, + __global const float* __restrict__ gm_coefficients) +{ + const float3 atomForces = vload3(forceIndexLocal, sm_forces); + float negCoefficient = -scale * gm_coefficients[forceIndexGlobal]; + float3 result; + result.x = negCoefficient * recipBox[XX][XX] * atomForces.x; + result.y = negCoefficient * (recipBox[XX][YY] * atomForces.x + recipBox[YY][YY] * atomForces.y); + result.z = negCoefficient + * (recipBox[XX][ZZ] * atomForces.x + recipBox[YY][ZZ] * atomForces.y + + recipBox[ZZ][ZZ] * atomForces.z); + vstore3(result, forceIndexLocal, sm_forces); +} + #endif // COMPILE_GATHER_HELPERS_ONCE /*! \brief @@ -192,8 +272,15 @@ inline void reduce_atom_forces(__local float* __restrict__ sm_forces, * Please see the file description for additional defines which this kernel expects. * * \param[in] kernelParams All the PME GPU data. - * \param[in] gm_coefficients Atom charges/coefficients. - * \param[in] gm_grid Global 3D grid. + * \param[in] gm_coefficientsA Atom charges/coefficients in the unperturbed state, or FEP + * state A. + * \param[in] gm_coefficientsB Atom charges/coefficients in FEP state B. Only used + * when spreading interpolated coefficients on one grid or spreading two sets of coefficients on two + * separate grids. + * \param[in] gm_gridA Global 3D grid for the unperturbed state, FEP + * state A or the single grid used for interpolated coefficients on one grid in FEP A/B. + * \param[in] gm_gridB Global 3D grid for FEP state B when using dual + * grids (when calculating energy and virials). * \param[in] gm_theta Atom spline parameter values * \param[in] gm_dtheta Atom spline parameter derivatives * \param[in] gm_gridlineIndices Atom gridline indices (ivec) @@ -201,13 +288,17 @@ inline void reduce_atom_forces(__local float* __restrict__ sm_forces, */ __attribute__((reqd_work_group_size(order, order, atomsPerBlock))) __kernel void CUSTOMIZED_KERNEL_NAME(pme_gather_kernel)(const struct PmeOpenCLKernelParams kernelParams, - __global const float* __restrict__ gm_coefficients, - __global const float* __restrict__ gm_grid, + __global const float* __restrict__ gm_coefficientsA, + __global const float* __restrict__ gm_coefficientsB, + __global const float* __restrict__ gm_gridA, + __global const float* __restrict__ gm_gridB, __global const float* __restrict__ gm_theta, __global const float* __restrict__ gm_dtheta, __global const int* __restrict__ gm_gridlineIndices, __global float* __restrict__ gm_forces) { + assert(numGrids == 1 || numGrids == 2); + /* These are the atom indices - for the shared and global memory */ const int atomIndexLocal = get_local_id(ZZ); const int atomIndexOffset = (int)get_group_id(XX) * atomsPerBlock; @@ -267,58 +358,40 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_gather_kernel)(const struct PmeOpenCLKe float fy = 0.0F; float fz = 0.0F; - const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficients[atomIndexGlobal]); + int chargeCheck = pme_gpu_check_atom_charge(gm_coefficientsA[atomIndexGlobal]); - if (chargeCheck) + const int nx = kernelParams.grid.realGridSize[XX]; + const int ny = kernelParams.grid.realGridSize[YY]; + const int nz = kernelParams.grid.realGridSize[ZZ]; + const int pny = kernelParams.grid.realGridSizePadded[YY]; + const int pnz = kernelParams.grid.realGridSizePadded[ZZ]; + + const int atomWarpIndex = atomIndexLocal % atomsPerWarp; + const int warpIndex 
= atomIndexLocal / atomsPerWarp; + + const int splineIndexBase = getSplineParamIndexBase(warpIndex, atomWarpIndex); + const int splineIndexY = getSplineParamIndex(splineIndexBase, YY, ithy); + const float2 tdy = sm_splineParams[splineIndexY]; + const int splineIndexZ = getSplineParamIndex(splineIndexBase, ZZ, ithz); + const float2 tdz = sm_splineParams[splineIndexZ]; + + const int ixBase = sm_gridlineIndices[atomIndexLocal * DIM + XX]; + int iy = sm_gridlineIndices[atomIndexLocal * DIM + YY] + ithy; + if (wrapY & (iy >= ny)) { - const int nx = kernelParams.grid.realGridSize[XX]; - const int ny = kernelParams.grid.realGridSize[YY]; - const int nz = kernelParams.grid.realGridSize[ZZ]; - const int pny = kernelParams.grid.realGridSizePadded[YY]; - const int pnz = kernelParams.grid.realGridSizePadded[ZZ]; - - const int atomWarpIndex = atomIndexLocal % atomsPerWarp; - const int warpIndex = atomIndexLocal / atomsPerWarp; - - const int splineIndexBase = getSplineParamIndexBase(warpIndex, atomWarpIndex); - const int splineIndexY = getSplineParamIndex(splineIndexBase, YY, ithy); - const float2 tdy = sm_splineParams[splineIndexY]; - const int splineIndexZ = getSplineParamIndex(splineIndexBase, ZZ, ithz); - const float2 tdz = sm_splineParams[splineIndexZ]; - - const int ixBase = sm_gridlineIndices[atomIndexLocal * DIM + XX]; - int iy = sm_gridlineIndices[atomIndexLocal * DIM + YY] + ithy; - if (wrapY & (iy >= ny)) - { - iy -= ny; - } - int iz = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] + ithz; - if (iz >= nz) - { - iz -= nz; - } - const int constOffset = iy * pnz + iz; + iy -= ny; + } + int iz = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] + ithz; + if (iz >= nz) + { + iz -= nz; + } + const int constOffset = iy * pnz + iz; -#pragma unroll order - for (int ithx = 0; (ithx < order); ithx++) - { - int ix = ixBase + ithx; - if (wrapX & (ix >= nx)) - { - ix -= nx; - } - const int gridIndexGlobal = ix * pny * pnz + constOffset; - assert(gridIndexGlobal >= 0); - const float gridValue = gm_grid[gridIndexGlobal]; - assert(isfinite(gridValue)); - const int splineIndexX = getSplineParamIndex(splineIndexBase, XX, ithx); - const float2 tdx = sm_splineParams[splineIndexX]; - const float fxy1 = tdz.x * gridValue; - const float fz1 = tdz.y * gridValue; - fx += tdx.y * tdy.x * fxy1; - fy += tdx.x * tdy.y * fxy1; - fz += tdx.x * tdy.x * fz1; - } + if (chargeCheck) + { + sumForceComponents(&fx, &fy, &fz, ixBase, nx, pny, pnz, constOffset, splineIndexBase, tdy, + tdz, sm_splineParams, gm_gridA); } // Reduction of partial force contributions @@ -332,22 +405,13 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_gather_kernel)(const struct PmeOpenCLKe barrier(CLK_LOCAL_MEM_FENCE); /* Calculating the final forces with no component branching, atomsPerBlock threads */ - const int forceIndexLocal = threadLocalId; - const int forceIndexGlobal = atomIndexOffset + forceIndexLocal; + const int forceIndexLocal = threadLocalId; + const int forceIndexGlobal = atomIndexOffset + forceIndexLocal; + const float scale = kernelParams.current.scale; if (forceIndexLocal < atomsPerBlock) { - const float3 atomForces = vload3(forceIndexLocal, sm_forces); - const float negCoefficient = -gm_coefficients[forceIndexGlobal]; - float3 result; - result.x = negCoefficient * kernelParams.current.recipBox[XX][XX] * atomForces.x; - result.y = negCoefficient - * (kernelParams.current.recipBox[XX][YY] * atomForces.x - + kernelParams.current.recipBox[YY][YY] * atomForces.y); - result.z = negCoefficient - * (kernelParams.current.recipBox[XX][ZZ] * atomForces.x - 
+ kernelParams.current.recipBox[YY][ZZ] * atomForces.y - + kernelParams.current.recipBox[ZZ][ZZ] * atomForces.z); - vstore3(result, forceIndexLocal, sm_forces); + calculateAndStoreGridForces(sm_forces, forceIndexLocal, forceIndexGlobal, + kernelParams.current.recipBox, scale, gm_coefficientsA); } #if !defined(_AMD_SOURCE_) && !defined(_NVIDIA_SOURCE_) @@ -374,4 +438,46 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_gather_kernel)(const struct PmeOpenCLKe gm_forces[outputIndexGlobal] = outputForceComponent; } } + + if (numGrids == 2) + { + barrier(CLK_LOCAL_MEM_FENCE); + fx = 0.0f; + fy = 0.0f; + fz = 0.0f; + chargeCheck = pme_gpu_check_atom_charge(gm_coefficientsB[atomIndexGlobal]); + if (chargeCheck) + { + sumForceComponents(&fx, &fy, &fz, ixBase, nx, pny, pnz, constOffset, splineIndexBase, + tdy, tdz, sm_splineParams, gm_gridB); + } + reduce_atom_forces(sm_forces, atomIndexLocal, splineIndex, lineIndex, + kernelParams.grid.realGridSizeFP, fx, fy, fz, sm_forceReduction, sm_forceTemp); + barrier(CLK_LOCAL_MEM_FENCE); + if (forceIndexLocal < atomsPerBlock) + { + calculateAndStoreGridForces(sm_forces, forceIndexLocal, forceIndexGlobal, + kernelParams.current.recipBox, 1.0F - scale, gm_coefficientsB); + } + +#if !defined(_AMD_SOURCE_) && !defined(_NVIDIA_SOURCE_) + /* This is only here for execution of e.g. 32-sized warps on 16-wide hardware; this was + * __syncwarp() in CUDA. #2519 + */ + barrier(CLK_LOCAL_MEM_FENCE); +#endif + + /* Writing or adding the final forces component-wise, single warp */ + if (threadLocalId < iterThreads) + { +#pragma unroll + for (int i = 0; i < numIter; i++) + { + const int outputIndexLocal = i * iterThreads + threadLocalId; + const int outputIndexGlobal = get_group_id(XX) * blockForcesSize + outputIndexLocal; + const float outputForceComponent = sm_forces[outputIndexLocal]; + gm_forces[outputIndexGlobal] += outputForceComponent; + } + } + } } diff --git a/src/gromacs/ewald/pme_gather.cu b/src/gromacs/ewald/pme_gather.cu index 2edec547ef..52814c4ce4 100644 --- a/src/gromacs/ewald/pme_gather.cu +++ b/src/gromacs/ewald/pme_gather.cu @@ -213,6 +213,119 @@ __device__ __forceinline__ void reduce_atom_forces(float3* __restrict__ sm_force } } +/*! \brief Calculate the sum of the force partial components (in X, Y and Z) + * + * \tparam[in] order The PME order (must be 4). + * \tparam[in] atomsPerWarp The number of atoms per GPU warp. + * \tparam[in] wrapX Tells if the grid is wrapped in the X dimension. + * \tparam[in] wrapY Tells if the grid is wrapped in the Y dimension. + * \param[out] fx The force partial component in the X dimension. + * \param[out] fy The force partial component in the Y dimension. + * \param[out] fz The force partial component in the Z dimension. + * \param[in] ithyMin The thread minimum index in the Y dimension. + * \param[in] ithyMax The thread maximum index in the Y dimension. + * \param[in] ixBase The grid line index base value in the X dimension. + * \param[in] iz The grid line index in the Z dimension. + * \param[in] nx The grid real size in the X dimension. + * \param[in] ny The grid real size in the Y dimension. + * \param[in] pny The padded grid real size in the Y dimension. + * \param[in] pnz The padded grid real size in the Z dimension. + * \param[in] atomIndexLocal The atom index for this thread. + * \param[in] splineIndexBase The base value of the spline parameter index. + * \param[in] tdz The theta and dtheta in the Z dimension. + * \param[in] sm_gridlineIndices Shared memory array of grid line indices. 
+ * \param[in] sm_theta Shared memory array of atom theta values. + * \param[in] sm_dtheta Shared memory array of atom dtheta values. + * \param[in] gm_grid Global memory array of the grid to use. + */ +template +__device__ __forceinline__ void sumForceComponents(float* __restrict__ fx, + float* __restrict__ fy, + float* __restrict__ fz, + const int ithyMin, + const int ithyMax, + const int ixBase, + const int iz, + const int nx, + const int ny, + const int pny, + const int pnz, + const int atomIndexLocal, + const int splineIndexBase, + const float2 tdz, + const int* __restrict__ sm_gridlineIndices, + const float* __restrict__ sm_theta, + const float* __restrict__ sm_dtheta, + const float* __restrict__ gm_grid) +{ +#pragma unroll + for (int ithy = ithyMin; ithy < ithyMax; ithy++) + { + const int splineIndexY = getSplineParamIndex(splineIndexBase, YY, ithy); + const float2 tdy = make_float2(sm_theta[splineIndexY], sm_dtheta[splineIndexY]); + + int iy = sm_gridlineIndices[atomIndexLocal * DIM + YY] + ithy; + if (wrapY & (iy >= ny)) + { + iy -= ny; + } + const int constOffset = iy * pnz + iz; + +#pragma unroll + for (int ithx = 0; (ithx < order); ithx++) + { + int ix = ixBase + ithx; + if (wrapX & (ix >= nx)) + { + ix -= nx; + } + const int gridIndexGlobal = ix * pny * pnz + constOffset; + assert(gridIndexGlobal >= 0); + const float gridValue = gm_grid[gridIndexGlobal]; + assert(isfinite(gridValue)); + const int splineIndexX = getSplineParamIndex(splineIndexBase, XX, ithx); + const float2 tdx = make_float2(sm_theta[splineIndexX], sm_dtheta[splineIndexX]); + const float fxy1 = tdz.x * gridValue; + const float fz1 = tdz.y * gridValue; + *fx += tdx.y * tdy.x * fxy1; + *fy += tdx.x * tdy.y * fxy1; + *fz += tdx.x * tdy.x * fz1; + } + } +} + +/*! \brief Calculate the grid forces and store them in shared memory. + * + * \param[in,out] sm_forces Shared memory array with the output forces. + * \param[in] forceIndexLocal The local (per thread) index in the sm_forces array. + * \param[in] forceIndexGlobal The index of the thread in the gm_coefficients array. + * \param[in] recipBox The reciprocal box. + * \param[in] scale The scale to use when calculating the forces. For gm_coefficientsB + * (when using multiple coefficients on a single grid) the scale will be (1.0 - scale). + * \param[in] gm_coefficients Global memory array of the coefficients to use for an unperturbed + * or FEP in state A if a single grid is used (\p multiCoefficientsSingleGrid == true).If two + * separate grids are used this should be the coefficients of the grid in question. + * \param[in] gm_coefficientsB Global memory array of the coefficients to use for FEP in state B. + * Should be nullptr if two separate grids are used. + */ +__device__ __forceinline__ void calculateAndStoreGridForces(float3* __restrict__ sm_forces, + const int forceIndexLocal, + const int forceIndexGlobal, + const float recipBox[DIM][DIM], + const float scale, + const float* __restrict__ gm_coefficients) +{ + const float3 atomForces = sm_forces[forceIndexLocal]; + float negCoefficient = -scale * gm_coefficients[forceIndexGlobal]; + float3 result; + result.x = negCoefficient * recipBox[XX][XX] * atomForces.x; + result.y = negCoefficient * (recipBox[XX][YY] * atomForces.x + recipBox[YY][YY] * atomForces.y); + result.z = negCoefficient + * (recipBox[XX][ZZ] * atomForces.x + recipBox[YY][ZZ] * atomForces.y + + recipBox[ZZ][ZZ] * atomForces.z); + sm_forces[forceIndexLocal] = result; +} + /*! \brief * A CUDA kernel which gathers the atom forces from the grid. 
* The grid is assumed to be wrapped in dimension Z. @@ -220,19 +333,24 @@ __device__ __forceinline__ void reduce_atom_forces(float3* __restrict__ sm_force * \tparam[in] order The PME order (must be 4 currently). * \tparam[in] wrapX Tells if the grid is wrapped in the X dimension. * \tparam[in] wrapY Tells if the grid is wrapped in the Y dimension. + * \tparam[in] numGrids The number of grids to use in the kernel. Can be 1 or 2. * \tparam[in] readGlobal Tells if we should read spline values from global memory * \tparam[in] threadsPerAtom How many threads work on each atom * * \param[in] kernelParams All the PME GPU data. */ -template +template __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__ void pme_gather_kernel(const PmeGpuCudaKernelParams kernelParams) { + assert(numGrids == 1 || numGrids == 2); + /* Global memory pointers */ - const float* __restrict__ gm_coefficients = kernelParams.atoms.d_coefficients; - const float* __restrict__ gm_grid = kernelParams.grid.d_realGrid; - float* __restrict__ gm_forces = kernelParams.atoms.d_forces; + const float* __restrict__ gm_coefficientsA = kernelParams.atoms.d_coefficients[0]; + const float* __restrict__ gm_coefficientsB = kernelParams.atoms.d_coefficients[1]; + const float* __restrict__ gm_gridA = kernelParams.grid.d_realGrid[0]; + const float* __restrict__ gm_gridB = kernelParams.grid.d_realGrid[1]; + float* __restrict__ gm_forces = kernelParams.atoms.d_forces; /* Global memory pointers for readGlobal */ const float* __restrict__ gm_theta = kernelParams.atoms.d_theta; @@ -328,7 +446,7 @@ __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__ // Coordinates __shared__ float3 sm_coordinates[atomsPerBlock]; /* Staging coefficients/charges */ - pme_gpu_stage_atom_data(sm_coefficients, gm_coefficients); + pme_gpu_stage_atom_data(sm_coefficients, gm_coefficientsA); /* Staging coordinates */ pme_gpu_stage_atom_data(sm_coordinates, gm_coordinates); @@ -339,7 +457,7 @@ __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__ else { atomX = gm_coordinates[atomIndexGlobal]; - atomCharge = gm_coefficients[atomIndexGlobal]; + atomCharge = gm_coefficientsA[atomIndexGlobal]; } calculate_splines( kernelParams, atomIndexOffset, atomX, atomCharge, sm_theta, sm_dtheta, sm_gridlineIndices); @@ -349,70 +467,37 @@ __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__ float fy = 0.0f; float fz = 0.0f; - const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficients[atomIndexGlobal]); - - if (chargeCheck) - { - const int nx = kernelParams.grid.realGridSize[XX]; - const int ny = kernelParams.grid.realGridSize[YY]; - const int nz = kernelParams.grid.realGridSize[ZZ]; - const int pny = kernelParams.grid.realGridSizePadded[YY]; - const int pnz = kernelParams.grid.realGridSizePadded[ZZ]; - - const int atomWarpIndex = atomIndexLocal % atomsPerWarp; - const int warpIndex = atomIndexLocal / atomsPerWarp; + const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficientsA[atomIndexGlobal]); - const int splineIndexBase = getSplineParamIndexBase(warpIndex, atomWarpIndex); - const int splineIndexZ = getSplineParamIndex(splineIndexBase, ZZ, ithz); - const float2 tdz = make_float2(sm_theta[splineIndexZ], sm_dtheta[splineIndexZ]); + const int nx = kernelParams.grid.realGridSize[XX]; + const int ny = kernelParams.grid.realGridSize[YY]; + const int nz = kernelParams.grid.realGridSize[ZZ]; + const int pny = kernelParams.grid.realGridSizePadded[YY]; + const int pnz = 
kernelParams.grid.realGridSizePadded[ZZ]; - int iz = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] + ithz; - const int ixBase = sm_gridlineIndices[atomIndexLocal * DIM + XX]; + const int atomWarpIndex = atomIndexLocal % atomsPerWarp; + const int warpIndex = atomIndexLocal / atomsPerWarp; - if (iz >= nz) - { - iz -= nz; - } - int constOffset, iy; + const int splineIndexBase = getSplineParamIndexBase(warpIndex, atomWarpIndex); + const int splineIndexZ = getSplineParamIndex(splineIndexBase, ZZ, ithz); + const float2 tdz = make_float2(sm_theta[splineIndexZ], sm_dtheta[splineIndexZ]); - const int ithyMin = (threadsPerAtom == ThreadsPerAtom::Order) ? 0 : threadIdx.y; - const int ithyMax = (threadsPerAtom == ThreadsPerAtom::Order) ? order : threadIdx.y + 1; - for (int ithy = ithyMin; ithy < ithyMax; ithy++) - { - const int splineIndexY = getSplineParamIndex(splineIndexBase, YY, ithy); - const float2 tdy = make_float2(sm_theta[splineIndexY], sm_dtheta[splineIndexY]); + int iz = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] + ithz; + const int ixBase = sm_gridlineIndices[atomIndexLocal * DIM + XX]; - iy = sm_gridlineIndices[atomIndexLocal * DIM + YY] + ithy; - if (wrapY & (iy >= ny)) - { - iy -= ny; - } - constOffset = iy * pnz + iz; - -#pragma unroll - for (int ithx = 0; (ithx < order); ithx++) - { - int ix = ixBase + ithx; - if (wrapX & (ix >= nx)) - { - ix -= nx; - } - const int gridIndexGlobal = ix * pny * pnz + constOffset; - assert(gridIndexGlobal >= 0); - const float gridValue = gm_grid[gridIndexGlobal]; - assert(isfinite(gridValue)); - const int splineIndexX = - getSplineParamIndex(splineIndexBase, XX, ithx); - const float2 tdx = make_float2(sm_theta[splineIndexX], sm_dtheta[splineIndexX]); - const float fxy1 = tdz.x * gridValue; - const float fz1 = tdz.y * gridValue; - fx += tdx.y * tdy.x * fxy1; - fy += tdx.x * tdy.y * fxy1; - fz += tdx.x * tdy.x * fz1; - } - } + if (iz >= nz) + { + iz -= nz; } + const int ithyMin = (threadsPerAtom == ThreadsPerAtom::Order) ? 0 : threadIdx.y; + const int ithyMax = (threadsPerAtom == ThreadsPerAtom::Order) ? 
order : threadIdx.y + 1; + if (chargeCheck) + { + sumForceComponents( + &fx, &fy, &fz, ithyMin, ithyMax, ixBase, iz, nx, ny, pny, pnz, atomIndexLocal, + splineIndexBase, tdz, sm_gridlineIndices, sm_theta, sm_dtheta, gm_gridA); + } // Reduction of partial force contributions __shared__ float3 sm_forces[atomsPerBlock]; reduce_atom_forces(sm_forces, atomIndexLocal, splineIndex, lineIndex, @@ -420,22 +505,13 @@ __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__ __syncthreads(); /* Calculating the final forces with no component branching, atomsPerBlock threads */ - const int forceIndexLocal = threadLocalId; - const int forceIndexGlobal = atomIndexOffset + forceIndexLocal; + const int forceIndexLocal = threadLocalId; + const int forceIndexGlobal = atomIndexOffset + forceIndexLocal; + const float scale = kernelParams.current.scale; if (forceIndexLocal < atomsPerBlock) { - const float3 atomForces = sm_forces[forceIndexLocal]; - const float negCoefficient = -gm_coefficients[forceIndexGlobal]; - float3 result; - result.x = negCoefficient * kernelParams.current.recipBox[XX][XX] * atomForces.x; - result.y = negCoefficient - * (kernelParams.current.recipBox[XX][YY] * atomForces.x - + kernelParams.current.recipBox[YY][YY] * atomForces.y); - result.z = negCoefficient - * (kernelParams.current.recipBox[XX][ZZ] * atomForces.x - + kernelParams.current.recipBox[YY][ZZ] * atomForces.y - + kernelParams.current.recipBox[ZZ][ZZ] * atomForces.z); - sm_forces[forceIndexLocal] = result; + calculateAndStoreGridForces(sm_forces, forceIndexLocal, forceIndexGlobal, + kernelParams.current.recipBox, scale, gm_coefficientsA); } __syncwarp(); @@ -450,18 +526,65 @@ __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__ #pragma unroll for (int i = 0; i < numIter; i++) { - int outputIndexLocal = i * iterThreads + threadLocalId; - int outputIndexGlobal = blockIndex * blockForcesSize + outputIndexLocal; - const float outputForceComponent = ((float*)sm_forces)[outputIndexLocal]; - gm_forces[outputIndexGlobal] = outputForceComponent; + int outputIndexLocal = i * iterThreads + threadLocalId; + int outputIndexGlobal = blockIndex * blockForcesSize + outputIndexLocal; + float outputForceComponent = ((float*)sm_forces)[outputIndexLocal]; + gm_forces[outputIndexGlobal] = outputForceComponent; + } + } + + if (numGrids == 2) + { + /* We must sync here since the same shared memory is used as above. 
*/ + __syncthreads(); + fx = 0.0f; + fy = 0.0f; + fz = 0.0f; + const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficientsB[atomIndexGlobal]); + if (chargeCheck) + { + sumForceComponents( + &fx, &fy, &fz, ithyMin, ithyMax, ixBase, iz, nx, ny, pny, pnz, atomIndexLocal, + splineIndexBase, tdz, sm_gridlineIndices, sm_theta, sm_dtheta, gm_gridB); + } + // Reduction of partial force contributions + reduce_atom_forces(sm_forces, atomIndexLocal, splineIndex, + lineIndex, kernelParams.grid.realGridSizeFP, + fx, fy, fz); + __syncthreads(); + + /* Calculating the final forces with no component branching, atomsPerBlock threads */ + if (forceIndexLocal < atomsPerBlock) + { + calculateAndStoreGridForces(sm_forces, forceIndexLocal, forceIndexGlobal, + kernelParams.current.recipBox, 1.0F - scale, gm_coefficientsB); + } + + __syncwarp(); + + /* Writing or adding the final forces component-wise, single warp */ + if (threadLocalId < iterThreads) + { +#pragma unroll + for (int i = 0; i < numIter; i++) + { + int outputIndexLocal = i * iterThreads + threadLocalId; + int outputIndexGlobal = blockIndex * blockForcesSize + outputIndexLocal; + float outputForceComponent = ((float*)sm_forces)[outputIndexLocal]; + gm_forces[outputIndexGlobal] += outputForceComponent; + } } } } //! Kernel instantiations // clang-format off -template __global__ void pme_gather_kernel<4, true, true, true, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams); -template __global__ void pme_gather_kernel<4, true, true, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams); -template __global__ void pme_gather_kernel<4, true, true, false, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams); -template __global__ void pme_gather_kernel<4, true, true, false, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams); -// clang-format on \ No newline at end of file +template __global__ void pme_gather_kernel<4, true, true, 1, true, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams); +template __global__ void pme_gather_kernel<4, true, true, 1, true, ThreadsPerAtom::OrderSquared> (const PmeGpuCudaKernelParams); +template __global__ void pme_gather_kernel<4, true, true, 1, false, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams); +template __global__ void pme_gather_kernel<4, true, true, 1, false, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams); +template __global__ void pme_gather_kernel<4, true, true, 2, true, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams); +template __global__ void pme_gather_kernel<4, true, true, 2, true, ThreadsPerAtom::OrderSquared> (const PmeGpuCudaKernelParams); +template __global__ void pme_gather_kernel<4, true, true, 2, false, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams); +template __global__ void pme_gather_kernel<4, true, true, 2, false, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams); +// clang-format on diff --git a/src/gromacs/ewald/pme_gpu.cpp b/src/gromacs/ewald/pme_gpu.cpp index a5f54f004d..bcac411467 100644 --- a/src/gromacs/ewald/pme_gpu.cpp +++ b/src/gromacs/ewald/pme_gpu.cpp @@ -125,7 +125,6 @@ void inline parallel_3dfft_execute_gpu_wrapper(gmx_pme_t* pme, enum gmx_fft_direction dir, gmx_wallcycle_t wcycle) { - GMX_ASSERT(gridIndex == 0, "Only single grid supported"); if (pme_gpu_settings(pme->gpu).performGPUFFT) { wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU); @@ -190,32 +189,41 @@ void pme_gpu_prepare_computation(gmx_pme_t* pme, } } -void pme_gpu_launch_spread(gmx_pme_t* pme, GpuEventSynchronizer* xReadyOnDevice, gmx_wallcycle* wcycle) 
+void pme_gpu_launch_spread(gmx_pme_t* pme, + GpuEventSynchronizer* xReadyOnDevice, + gmx_wallcycle* wcycle, + const real lambdaQ) { GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled."); GMX_ASSERT(xReadyOnDevice || !pme->bPPnode || (GMX_GPU != GMX_GPU_CUDA), "Need a valid xReadyOnDevice on PP+PME ranks with CUDA."); + GMX_ASSERT(pme->doCoulomb, "Only Coulomb PME can be run on GPU."); PmeGpu* pmeGpu = pme->gpu; - const unsigned int gridIndex = 0; - real* fftgrid = pme->fftgrid[gridIndex]; + GMX_ASSERT(pmeGpu->common->ngrids == 1 || (pmeGpu->common->ngrids == 2 && pme->bFEP_q), + "If not decoupling Coulomb interactions there should only be one FEP grid. If " + "decoupling Coulomb interactions there should be two grids."); + + /* PME on GPU can currently manage two grids: + * grid_index=0: Coulomb PME with charges in the normal state or from FEP state A. + * grid_index=1: Coulomb PME with charges from FEP state B. + */ + real** fftgrids = pme->fftgrid; /* Spread the coefficients on a grid */ const bool computeSplines = true; const bool spreadCharges = true; wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU); wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME); - pme_gpu_spread(pmeGpu, xReadyOnDevice, gridIndex, fftgrid, computeSplines, spreadCharges); + pme_gpu_spread(pmeGpu, xReadyOnDevice, fftgrids, computeSplines, spreadCharges, lambdaQ); wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME); wallcycle_stop(wcycle, ewcLAUNCH_GPU); } void pme_gpu_launch_complex_transforms(gmx_pme_t* pme, gmx_wallcycle* wcycle, const gmx::StepWorkload& stepWork) { - PmeGpu* pmeGpu = pme->gpu; - const auto& settings = pmeGpu->settings; - const unsigned int gridIndex = 0; - t_complex* cfftgrid = pme->cfftgrid[gridIndex]; + PmeGpu* pmeGpu = pme->gpu; + const auto& settings = pmeGpu->settings; // There's no support for computing energy without virial, or vice versa const bool computeEnergyAndVirial = stepWork.computeEnergy || stepWork.computeVirial; if (!settings.performGPUFFT) @@ -227,37 +235,44 @@ void pme_gpu_launch_complex_transforms(gmx_pme_t* pme, gmx_wallcycle* wcycle, co try { - /* do R2C 3D-FFT */ - parallel_3dfft_execute_gpu_wrapper(pme, gridIndex, GMX_FFT_REAL_TO_COMPLEX, wcycle); - - /* solve in k-space for our local cells */ - if (settings.performGPUSolve) - { - // TODO grid ordering should be set up at pme init time. - const auto gridOrdering = settings.useDecomposition ? GridOrdering::YZX : GridOrdering::XYZ; - wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU); - wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME); - pme_gpu_solve(pmeGpu, cfftgrid, gridOrdering, computeEnergyAndVirial); - wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME); - wallcycle_stop(wcycle, ewcLAUNCH_GPU); - } - else + /* The 3dffts and the solve are done in a loop to simplify things, even if this means that + * there will be two kernel launches for solve. */ + for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++) { - wallcycle_start(wcycle, ewcPME_SOLVE_MIXED_MODE); -#pragma omp parallel for num_threads(pme->nthread) schedule(static) - for (int thread = 0; thread < pme->nthread; thread++) + /* do R2C 3D-FFT */ + t_complex* cfftgrid = pme->cfftgrid[gridIndex]; + parallel_3dfft_execute_gpu_wrapper(pme, gridIndex, GMX_FFT_REAL_TO_COMPLEX, wcycle); + + /* solve in k-space for our local cells */ + if (settings.performGPUSolve) { - solve_pme_yzx(pme, cfftgrid, pme->boxVolume, computeEnergyAndVirial, pme->nthread, thread); + const auto gridOrdering = + settings.useDecomposition ? 
GridOrdering::YZX : GridOrdering::XYZ; + wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU); + wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME); + pme_gpu_solve(pmeGpu, gridIndex, cfftgrid, gridOrdering, computeEnergyAndVirial); + wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME); + wallcycle_stop(wcycle, ewcLAUNCH_GPU); + } + else + { + wallcycle_start(wcycle, ewcPME_SOLVE_MIXED_MODE); +#pragma omp parallel for num_threads(pme->nthread) schedule(static) + for (int thread = 0; thread < pme->nthread; thread++) + { + solve_pme_yzx(pme, cfftgrid, pme->boxVolume, computeEnergyAndVirial, + pme->nthread, thread); + } + wallcycle_stop(wcycle, ewcPME_SOLVE_MIXED_MODE); } - wallcycle_stop(wcycle, ewcPME_SOLVE_MIXED_MODE); - } - parallel_3dfft_execute_gpu_wrapper(pme, gridIndex, GMX_FFT_COMPLEX_TO_REAL, wcycle); + parallel_3dfft_execute_gpu_wrapper(pme, gridIndex, GMX_FFT_COMPLEX_TO_REAL, wcycle); + } } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; } -void pme_gpu_launch_gather(const gmx_pme_t* pme, gmx_wallcycle gmx_unused* wcycle) +void pme_gpu_launch_gather(const gmx_pme_t* pme, gmx_wallcycle gmx_unused* wcycle, const real lambdaQ) { GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled."); @@ -268,9 +283,9 @@ void pme_gpu_launch_gather(const gmx_pme_t* pme, gmx_wallcycle gmx_unused* wcycl wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU); wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME); - const unsigned int gridIndex = 0; - real* fftgrid = pme->fftgrid[gridIndex]; - pme_gpu_gather(pme->gpu, reinterpret_cast(fftgrid)); + + float** fftgrids = pme->fftgrid; + pme_gpu_gather(pme->gpu, fftgrids, lambdaQ); wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME); wallcycle_stop(wcycle, ewcLAUNCH_GPU); } @@ -303,6 +318,7 @@ static void pme_gpu_reduce_outputs(const bool computeEnergyAndVirial, GMX_ASSERT(enerd, "Invalid energy output manager"); forceWithVirial->addVirialContribution(output.coulombVirial_); enerd->term[F_COUL_RECIP] += output.coulombEnergy_; + enerd->dvdl_lin[efptCOUL] += output.coulombDvdl_; } if (output.haveForceOutput_) { @@ -316,6 +332,7 @@ bool pme_gpu_try_finish_task(gmx_pme_t* pme, gmx_wallcycle* wcycle, gmx::ForceWithVirial* forceWithVirial, gmx_enerdata_t* enerd, + const real lambdaQ, GpuTaskCompletion completionKind) { GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled."); @@ -355,7 +372,8 @@ bool pme_gpu_try_finish_task(gmx_pme_t* pme, pme_gpu_update_timings(pme->gpu); // There's no support for computing energy without virial, or vice versa const bool computeEnergyAndVirial = stepWork.computeEnergy || stepWork.computeVirial; - PmeOutput output = pme_gpu_getOutput(*pme, computeEnergyAndVirial); + PmeOutput output = pme_gpu_getOutput(*pme, computeEnergyAndVirial, + pme->gpu->common->ngrids > 1 ? 
lambdaQ : 1.0); wallcycle_stop(wcycle, ewcWAIT_GPU_PME_GATHER); GMX_ASSERT(pme->gpu->settings.useGpuForceReduction == !output.haveForceOutput_, @@ -366,7 +384,10 @@ bool pme_gpu_try_finish_task(gmx_pme_t* pme, } // This is used by PME-only ranks -PmeOutput pme_gpu_wait_finish_task(gmx_pme_t* pme, const bool computeEnergyAndVirial, gmx_wallcycle* wcycle) +PmeOutput pme_gpu_wait_finish_task(gmx_pme_t* pme, + const bool computeEnergyAndVirial, + const real lambdaQ, + gmx_wallcycle* wcycle) { GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled."); @@ -380,7 +401,8 @@ PmeOutput pme_gpu_wait_finish_task(gmx_pme_t* pme, const bool computeEnergyAndVi pme_gpu_synchronize(pme->gpu); } - PmeOutput output = pme_gpu_getOutput(*pme, computeEnergyAndVirial); + PmeOutput output = pme_gpu_getOutput(*pme, computeEnergyAndVirial, + pme->gpu->common->ngrids > 1 ? lambdaQ : 1.0); wallcycle_stop(wcycle, ewcWAIT_GPU_PME_GATHER); return output; } @@ -390,11 +412,13 @@ void pme_gpu_wait_and_reduce(gmx_pme_t* pme, const gmx::StepWorkload& stepWork, gmx_wallcycle* wcycle, gmx::ForceWithVirial* forceWithVirial, - gmx_enerdata_t* enerd) + gmx_enerdata_t* enerd, + const real lambdaQ) { // There's no support for computing energy without virial, or vice versa const bool computeEnergyAndVirial = stepWork.computeEnergy || stepWork.computeVirial; - PmeOutput output = pme_gpu_wait_finish_task(pme, computeEnergyAndVirial, wcycle); + PmeOutput output = pme_gpu_wait_finish_task( + pme, computeEnergyAndVirial, pme->gpu->common->ngrids > 1 ? lambdaQ : 1.0, wcycle); GMX_ASSERT(pme->gpu->settings.useGpuForceReduction == !output.haveForceOutput_, "When forces are reduced on the CPU, there needs to be force output"); pme_gpu_reduce_outputs(computeEnergyAndVirial, output, wcycle, forceWithVirial, enerd); diff --git a/src/gromacs/ewald/pme_gpu_3dfft.cu b/src/gromacs/ewald/pme_gpu_3dfft.cu index 9f9578962e..ec928f745a 100644 --- a/src/gromacs/ewald/pme_gpu_3dfft.cu +++ b/src/gromacs/ewald/pme_gpu_3dfft.cu @@ -60,7 +60,7 @@ static void handleCufftError(cufftResult_t status, const char* msg) } } -GpuParallel3dFft::GpuParallel3dFft(const PmeGpu* pmeGpu) +GpuParallel3dFft::GpuParallel3dFft(const PmeGpu* pmeGpu, const int gridIndex) { const PmeGpuCudaKernelParams* kernelParamsPtr = pmeGpu->kernelParams.get(); ivec realGridSize, realGridSizePadded, complexGridSizePadded; @@ -79,9 +79,9 @@ GpuParallel3dFft::GpuParallel3dFft(const PmeGpu* pmeGpu) const int realGridSizePaddedTotal = realGridSizePadded[XX] * realGridSizePadded[YY] * realGridSizePadded[ZZ]; - realGrid_ = (cufftReal*)kernelParamsPtr->grid.d_realGrid; + realGrid_ = (cufftReal*)kernelParamsPtr->grid.d_realGrid[gridIndex]; GMX_RELEASE_ASSERT(realGrid_, "Bad (null) input real-space grid"); - complexGrid_ = (cufftComplex*)kernelParamsPtr->grid.d_fourierGrid; + complexGrid_ = (cufftComplex*)kernelParamsPtr->grid.d_fourierGrid[gridIndex]; GMX_RELEASE_ASSERT(complexGrid_, "Bad (null) input complex grid"); cufftResult_t result; diff --git a/src/gromacs/ewald/pme_gpu_3dfft.h b/src/gromacs/ewald/pme_gpu_3dfft.h index fc6d67a935..96cfa9ed6a 100644 --- a/src/gromacs/ewald/pme_gpu_3dfft.h +++ b/src/gromacs/ewald/pme_gpu_3dfft.h @@ -73,8 +73,9 @@ public: * Constructs CUDA/OpenCL FFT plans for performing 3D FFT on a PME grid. * * \param[in] pmeGpu The PME GPU structure. + * \param[in] gridIndex The index of the grid on which to perform the calculations. */ - GpuParallel3dFft(const PmeGpu* pmeGpu); + GpuParallel3dFft(const PmeGpu* pmeGpu, const int gridIndex); /*! 
\brief Destroys the FFT plans. */ ~GpuParallel3dFft(); /*! \brief Performs the FFT transform in given direction diff --git a/src/gromacs/ewald/pme_gpu_3dfft_ocl.cpp b/src/gromacs/ewald/pme_gpu_3dfft_ocl.cpp index b341a27b82..4ba6649497 100644 --- a/src/gromacs/ewald/pme_gpu_3dfft_ocl.cpp +++ b/src/gromacs/ewald/pme_gpu_3dfft_ocl.cpp @@ -63,7 +63,7 @@ static void handleClfftError(clfftStatus status, const char* msg) } } -GpuParallel3dFft::GpuParallel3dFft(const PmeGpu* pmeGpu) +GpuParallel3dFft::GpuParallel3dFft(const PmeGpu* pmeGpu, const int gridIndex) { // Extracting all the data from PME GPU std::array realGridSize, realGridSizePadded, complexGridSizePadded; @@ -82,8 +82,8 @@ GpuParallel3dFft::GpuParallel3dFft(const PmeGpu* pmeGpu) } cl_context context = pmeGpu->archSpecific->deviceContext_.context(); deviceStreams_.push_back(pmeGpu->archSpecific->pmeStream_.stream()); - realGrid_ = kernelParamsPtr->grid.d_realGrid; - complexGrid_ = kernelParamsPtr->grid.d_fourierGrid; + realGrid_ = kernelParamsPtr->grid.d_realGrid[gridIndex]; + complexGrid_ = kernelParamsPtr->grid.d_fourierGrid[gridIndex]; const bool performOutOfPlaceFFT = pmeGpu->archSpecific->performOutOfPlaceFFT; diff --git a/src/gromacs/ewald/pme_gpu_calculate_splines.cuh b/src/gromacs/ewald/pme_gpu_calculate_splines.cuh index bfdc3fc0e7..05649b600a 100644 --- a/src/gromacs/ewald/pme_gpu_calculate_splines.cuh +++ b/src/gromacs/ewald/pme_gpu_calculate_splines.cuh @@ -44,6 +44,7 @@ #include #include "gromacs/gpu_utils/cuda_kernel_utils.cuh" +#include "gromacs/gpu_utils/vectype_ops.cuh" #include "pme.cuh" #include "pme_grid.h" @@ -139,13 +140,13 @@ __device__ inline void assertIsFinite(T arg) /*! \brief * General purpose function for loading atom-related data from global to shared memory. * - * \tparam[in] T Data type (float/int/...) - * \tparam[in] atomsPerBlock Number of atoms processed by a block - should be - * accounted for in the size of the shared memory array. - * \tparam[in] dataCountPerAtom Number of data elements per single atom (e.g. DIM for - * an rvec coordinates array). - * \param[out] sm_destination Shared memory array for output. - * \param[in] gm_source Global memory array for input. + * \tparam[in] T Data type (float/int/...) + * \tparam[in] atomsPerBlock Number of atoms processed by a block - should be accounted for in + * the size of the shared memory array. + * \tparam[in] dataCountPerAtom Number of data elements per single atom (e.g. DIM for an rvec + * coordinates array). + * \param[out] sm_destination Shared memory array for output. + * \param[in] gm_source Global memory array for input. 
*/ template __device__ __forceinline__ void pme_gpu_stage_atom_data(T* __restrict__ sm_destination, diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp index 63b77aa86f..635f0f1a62 100644 --- a/src/gromacs/ewald/pme_gpu_internal.cpp +++ b/src/gromacs/ewald/pme_gpu_internal.cpp @@ -133,26 +133,46 @@ void pme_gpu_synchronize(const PmeGpu* pmeGpu) void pme_gpu_alloc_energy_virial(PmeGpu* pmeGpu) { const size_t energyAndVirialSize = c_virialAndEnergyCount * sizeof(float); - allocateDeviceBuffer(&pmeGpu->kernelParams->constants.d_virialAndEnergy, c_virialAndEnergyCount, - pmeGpu->archSpecific->deviceContext_); - pmalloc(reinterpret_cast(&pmeGpu->staging.h_virialAndEnergy), energyAndVirialSize); + + GMX_ASSERT( + pmeGpu->common->ngrids == 1 || pmeGpu->common->ngrids == 2, + "Only one (normal Coulomb PME) or two (FEP coulomb PME) PME grids can be used on GPU"); + + for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++) + { + allocateDeviceBuffer(&pmeGpu->kernelParams->constants.d_virialAndEnergy[gridIndex], + c_virialAndEnergyCount, pmeGpu->archSpecific->deviceContext_); + pmalloc(reinterpret_cast(&pmeGpu->staging.h_virialAndEnergy[gridIndex]), energyAndVirialSize); + } } void pme_gpu_free_energy_virial(PmeGpu* pmeGpu) { - freeDeviceBuffer(&pmeGpu->kernelParams->constants.d_virialAndEnergy); - pfree(pmeGpu->staging.h_virialAndEnergy); - pmeGpu->staging.h_virialAndEnergy = nullptr; + for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++) + { + freeDeviceBuffer(&pmeGpu->kernelParams->constants.d_virialAndEnergy[gridIndex]); + pfree(pmeGpu->staging.h_virialAndEnergy[gridIndex]); + pmeGpu->staging.h_virialAndEnergy[gridIndex] = nullptr; + } } void pme_gpu_clear_energy_virial(const PmeGpu* pmeGpu) { - clearDeviceBufferAsync(&pmeGpu->kernelParams->constants.d_virialAndEnergy, 0, - c_virialAndEnergyCount, pmeGpu->archSpecific->pmeStream_); + for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++) + { + clearDeviceBufferAsync(&pmeGpu->kernelParams->constants.d_virialAndEnergy[gridIndex], 0, + c_virialAndEnergyCount, pmeGpu->archSpecific->pmeStream_); + } } -void pme_gpu_realloc_and_copy_bspline_values(PmeGpu* pmeGpu) +void pme_gpu_realloc_and_copy_bspline_values(PmeGpu* pmeGpu, const int gridIndex) { + GMX_ASSERT( + pmeGpu->common->ngrids == 1 || pmeGpu->common->ngrids == 2, + "Only one (normal Coulomb PME) or two (FEP coulomb PME) PME grids can be used on GPU"); + GMX_ASSERT(gridIndex < pmeGpu->common->ngrids, + "Invalid combination of gridIndex and number of grids"); + const int splineValuesOffset[DIM] = { 0, pmeGpu->kernelParams->grid.realGridSize[XX], pmeGpu->kernelParams->grid.realGridSize[XX] + pmeGpu->kernelParams->grid.realGridSize[YY] }; @@ -161,33 +181,36 @@ void pme_gpu_realloc_and_copy_bspline_values(PmeGpu* pmeGpu) const int newSplineValuesSize = pmeGpu->kernelParams->grid.realGridSize[XX] + pmeGpu->kernelParams->grid.realGridSize[YY] + pmeGpu->kernelParams->grid.realGridSize[ZZ]; - const bool shouldRealloc = (newSplineValuesSize > pmeGpu->archSpecific->splineValuesSize); - reallocateDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli, newSplineValuesSize, - &pmeGpu->archSpecific->splineValuesSize, - &pmeGpu->archSpecific->splineValuesSizeAlloc, + const bool shouldRealloc = (newSplineValuesSize > pmeGpu->archSpecific->splineValuesSize[gridIndex]); + reallocateDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli[gridIndex], + newSplineValuesSize, 
&pmeGpu->archSpecific->splineValuesSize[gridIndex], + &pmeGpu->archSpecific->splineValuesCapacity[gridIndex], pmeGpu->archSpecific->deviceContext_); if (shouldRealloc) { /* Reallocate the host buffer */ - pfree(pmeGpu->staging.h_splineModuli); - pmalloc(reinterpret_cast(&pmeGpu->staging.h_splineModuli), + pfree(pmeGpu->staging.h_splineModuli[gridIndex]); + pmalloc(reinterpret_cast(&pmeGpu->staging.h_splineModuli[gridIndex]), newSplineValuesSize * sizeof(float)); } for (int i = 0; i < DIM; i++) { - memcpy(pmeGpu->staging.h_splineModuli + splineValuesOffset[i], + memcpy(pmeGpu->staging.h_splineModuli[gridIndex] + splineValuesOffset[i], pmeGpu->common->bsp_mod[i].data(), pmeGpu->common->bsp_mod[i].size() * sizeof(float)); } /* TODO: pin original buffer instead! */ - copyToDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli, pmeGpu->staging.h_splineModuli, - 0, newSplineValuesSize, pmeGpu->archSpecific->pmeStream_, - pmeGpu->settings.transferKind, nullptr); + copyToDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli[gridIndex], + pmeGpu->staging.h_splineModuli[gridIndex], 0, newSplineValuesSize, + pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); } void pme_gpu_free_bspline_values(const PmeGpu* pmeGpu) { - pfree(pmeGpu->staging.h_splineModuli); - freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli); + for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++) + { + pfree(pmeGpu->staging.h_splineModuli[gridIndex]); + freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli[gridIndex]); + } } void pme_gpu_realloc_forces(PmeGpu* pmeGpu) @@ -224,16 +247,18 @@ void pme_gpu_copy_output_forces(PmeGpu* pmeGpu) pmeGpu->settings.transferKind, nullptr); } -void pme_gpu_realloc_and_copy_input_coefficients(PmeGpu* pmeGpu, const float* h_coefficients) +void pme_gpu_realloc_and_copy_input_coefficients(const PmeGpu* pmeGpu, + const float* h_coefficients, + const int gridIndex) { GMX_ASSERT(h_coefficients, "Bad host-side charge buffer in PME GPU"); const size_t newCoefficientsSize = pmeGpu->nAtomsAlloc; GMX_ASSERT(newCoefficientsSize > 0, "Bad number of atoms in PME GPU"); - reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients, newCoefficientsSize, - &pmeGpu->archSpecific->coefficientsSize, - &pmeGpu->archSpecific->coefficientsSizeAlloc, + reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients[gridIndex], + newCoefficientsSize, &pmeGpu->archSpecific->coefficientsSize[gridIndex], + &pmeGpu->archSpecific->coefficientsCapacity[gridIndex], pmeGpu->archSpecific->deviceContext_); - copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients, + copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients[gridIndex], const_cast(h_coefficients), 0, pmeGpu->kernelParams->atoms.nAtoms, pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); @@ -241,14 +266,17 @@ void pme_gpu_realloc_and_copy_input_coefficients(PmeGpu* pmeGpu, const float* h_ const size_t paddingCount = pmeGpu->nAtomsAlloc - paddingIndex; if (paddingCount > 0) { - clearDeviceBufferAsync(&pmeGpu->kernelParams->atoms.d_coefficients, paddingIndex, + clearDeviceBufferAsync(&pmeGpu->kernelParams->atoms.d_coefficients[gridIndex], paddingIndex, paddingCount, pmeGpu->archSpecific->pmeStream_); } } void pme_gpu_free_coefficients(const PmeGpu* pmeGpu) { - freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients); + for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++) + { + 
freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients[gridIndex]); + } } void pme_gpu_realloc_spline_data(PmeGpu* pmeGpu) @@ -304,51 +332,65 @@ void pme_gpu_free_grid_indices(const PmeGpu* pmeGpu) void pme_gpu_realloc_grids(PmeGpu* pmeGpu) { - auto* kernelParamsPtr = pmeGpu->kernelParams.get(); + auto* kernelParamsPtr = pmeGpu->kernelParams.get(); + const int newRealGridSize = kernelParamsPtr->grid.realGridSizePadded[XX] * kernelParamsPtr->grid.realGridSizePadded[YY] * kernelParamsPtr->grid.realGridSizePadded[ZZ]; const int newComplexGridSize = kernelParamsPtr->grid.complexGridSizePadded[XX] * kernelParamsPtr->grid.complexGridSizePadded[YY] * kernelParamsPtr->grid.complexGridSizePadded[ZZ] * 2; - // Multiplied by 2 because we count complex grid size for complex numbers, but all allocations/pointers are float - if (pmeGpu->archSpecific->performOutOfPlaceFFT) - { - /* 2 separate grids */ - reallocateDeviceBuffer(&kernelParamsPtr->grid.d_fourierGrid, newComplexGridSize, - &pmeGpu->archSpecific->complexGridSize, - &pmeGpu->archSpecific->complexGridSizeAlloc, - pmeGpu->archSpecific->deviceContext_); - reallocateDeviceBuffer( - &kernelParamsPtr->grid.d_realGrid, newRealGridSize, &pmeGpu->archSpecific->realGridSize, - &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->deviceContext_); - } - else + for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++) { - /* A single buffer so that any grid will fit */ - const int newGridsSize = std::max(newRealGridSize, newComplexGridSize); - reallocateDeviceBuffer( - &kernelParamsPtr->grid.d_realGrid, newGridsSize, &pmeGpu->archSpecific->realGridSize, - &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->deviceContext_); - kernelParamsPtr->grid.d_fourierGrid = kernelParamsPtr->grid.d_realGrid; - pmeGpu->archSpecific->complexGridSize = pmeGpu->archSpecific->realGridSize; - // the size might get used later for copying the grid + // Multiplied by 2 because we count complex grid size for complex numbers, but all allocations/pointers are float + if (pmeGpu->archSpecific->performOutOfPlaceFFT) + { + /* 2 separate grids */ + reallocateDeviceBuffer(&kernelParamsPtr->grid.d_fourierGrid[gridIndex], newComplexGridSize, + &pmeGpu->archSpecific->complexGridSize[gridIndex], + &pmeGpu->archSpecific->complexGridCapacity[gridIndex], + pmeGpu->archSpecific->deviceContext_); + reallocateDeviceBuffer(&kernelParamsPtr->grid.d_realGrid[gridIndex], newRealGridSize, + &pmeGpu->archSpecific->realGridSize[gridIndex], + &pmeGpu->archSpecific->realGridCapacity[gridIndex], + pmeGpu->archSpecific->deviceContext_); + } + else + { + /* A single buffer so that any grid will fit */ + const int newGridsSize = std::max(newRealGridSize, newComplexGridSize); + reallocateDeviceBuffer(&kernelParamsPtr->grid.d_realGrid[gridIndex], newGridsSize, + &pmeGpu->archSpecific->realGridSize[gridIndex], + &pmeGpu->archSpecific->realGridCapacity[gridIndex], + pmeGpu->archSpecific->deviceContext_); + kernelParamsPtr->grid.d_fourierGrid[gridIndex] = kernelParamsPtr->grid.d_realGrid[gridIndex]; + pmeGpu->archSpecific->complexGridSize[gridIndex] = + pmeGpu->archSpecific->realGridSize[gridIndex]; + // the size might get used later for copying the grid + } } } void pme_gpu_free_grids(const PmeGpu* pmeGpu) { - if (pmeGpu->archSpecific->performOutOfPlaceFFT) + for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++) { - freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_fourierGrid); + if (pmeGpu->archSpecific->performOutOfPlaceFFT) + { + 
freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_fourierGrid[gridIndex]); + } + freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_realGrid[gridIndex]); } - freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_realGrid); } void pme_gpu_clear_grids(const PmeGpu* pmeGpu) { - clearDeviceBufferAsync(&pmeGpu->kernelParams->grid.d_realGrid, 0, - pmeGpu->archSpecific->realGridSize, pmeGpu->archSpecific->pmeStream_); + for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++) + { + clearDeviceBufferAsync(&pmeGpu->kernelParams->grid.d_realGrid[gridIndex], 0, + pmeGpu->archSpecific->realGridSize[gridIndex], + pmeGpu->archSpecific->pmeStream_); + } } void pme_gpu_realloc_and_copy_fract_shifts(PmeGpu* pmeGpu) @@ -395,17 +437,18 @@ bool pme_gpu_stream_query(const PmeGpu* pmeGpu) return haveStreamTasksCompleted(pmeGpu->archSpecific->pmeStream_); } -void pme_gpu_copy_input_gather_grid(const PmeGpu* pmeGpu, float* h_grid) +void pme_gpu_copy_input_gather_grid(const PmeGpu* pmeGpu, const float* h_grid, const int gridIndex) { - copyToDeviceBuffer(&pmeGpu->kernelParams->grid.d_realGrid, h_grid, 0, pmeGpu->archSpecific->realGridSize, + copyToDeviceBuffer(&pmeGpu->kernelParams->grid.d_realGrid[gridIndex], h_grid, 0, + pmeGpu->archSpecific->realGridSize[gridIndex], pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); } -void pme_gpu_copy_output_spread_grid(const PmeGpu* pmeGpu, float* h_grid) +void pme_gpu_copy_output_spread_grid(const PmeGpu* pmeGpu, float* h_grid, const int gridIndex) { - copyFromDeviceBuffer(h_grid, &pmeGpu->kernelParams->grid.d_realGrid, 0, - pmeGpu->archSpecific->realGridSize, pmeGpu->archSpecific->pmeStream_, - pmeGpu->settings.transferKind, nullptr); + copyFromDeviceBuffer(h_grid, &pmeGpu->kernelParams->grid.d_realGrid[gridIndex], 0, + pmeGpu->archSpecific->realGridSize[gridIndex], + pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); pmeGpu->archSpecific->syncSpreadGridD2H.markEvent(pmeGpu->archSpecific->pmeStream_); } @@ -488,9 +531,9 @@ void pme_gpu_reinit_3dfft(const PmeGpu* pmeGpu) if (pme_gpu_settings(pmeGpu).performGPUFFT) { pmeGpu->archSpecific->fftSetup.resize(0); - for (int i = 0; i < pmeGpu->common->ngrids; i++) + for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++) { - pmeGpu->archSpecific->fftSetup.push_back(std::make_unique(pmeGpu)); + pmeGpu->archSpecific->fftSetup.push_back(std::make_unique(pmeGpu, gridIndex)); } } } @@ -500,26 +543,62 @@ void pme_gpu_destroy_3dfft(const PmeGpu* pmeGpu) pmeGpu->archSpecific->fftSetup.resize(0); } -void pme_gpu_getEnergyAndVirial(const gmx_pme_t& pme, PmeOutput* output) +void pme_gpu_getEnergyAndVirial(const gmx_pme_t& pme, const float lambda, PmeOutput* output) { const PmeGpu* pmeGpu = pme.gpu; - for (int j = 0; j < c_virialAndEnergyCount; j++) - { - GMX_ASSERT(std::isfinite(pmeGpu->staging.h_virialAndEnergy[j]), - "PME GPU produces incorrect energy/virial."); - } - - unsigned int j = 0; - output->coulombVirial_[XX][XX] = 0.25F * pmeGpu->staging.h_virialAndEnergy[j++]; - output->coulombVirial_[YY][YY] = 0.25F * pmeGpu->staging.h_virialAndEnergy[j++]; - output->coulombVirial_[ZZ][ZZ] = 0.25F * pmeGpu->staging.h_virialAndEnergy[j++]; - output->coulombVirial_[XX][YY] = output->coulombVirial_[YY][XX] = - 0.25F * pmeGpu->staging.h_virialAndEnergy[j++]; - output->coulombVirial_[XX][ZZ] = output->coulombVirial_[ZZ][XX] = - 0.25F * pmeGpu->staging.h_virialAndEnergy[j++]; - output->coulombVirial_[YY][ZZ] = output->coulombVirial_[ZZ][YY] = - 0.25F * 
pmeGpu->staging.h_virialAndEnergy[j++]; - output->coulombEnergy_ = 0.5F * pmeGpu->staging.h_virialAndEnergy[j++]; + + GMX_ASSERT(lambda == 1.0 || pmeGpu->common->ngrids == 2, + "Invalid combination of lambda and number of grids"); + + for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++) + { + for (int j = 0; j < c_virialAndEnergyCount; j++) + { + GMX_ASSERT(std::isfinite(pmeGpu->staging.h_virialAndEnergy[gridIndex][j]), + "PME GPU produces incorrect energy/virial."); + } + } + for (int dim1 = 0; dim1 < DIM; dim1++) + { + for (int dim2 = 0; dim2 < DIM; dim2++) + { + output->coulombVirial_[dim1][dim2] = 0; + } + } + output->coulombEnergy_ = 0; + float scale = 1.0; + for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++) + { + if (pmeGpu->common->ngrids == 2) + { + scale = gridIndex == 0 ? (1.0 - lambda) : lambda; + } + output->coulombVirial_[XX][XX] += + scale * 0.25F * pmeGpu->staging.h_virialAndEnergy[gridIndex][0]; + output->coulombVirial_[YY][YY] += + scale * 0.25F * pmeGpu->staging.h_virialAndEnergy[gridIndex][1]; + output->coulombVirial_[ZZ][ZZ] += + scale * 0.25F * pmeGpu->staging.h_virialAndEnergy[gridIndex][2]; + output->coulombVirial_[XX][YY] += + scale * 0.25F * pmeGpu->staging.h_virialAndEnergy[gridIndex][3]; + output->coulombVirial_[YY][XX] += + scale * 0.25F * pmeGpu->staging.h_virialAndEnergy[gridIndex][3]; + output->coulombVirial_[XX][ZZ] += + scale * 0.25F * pmeGpu->staging.h_virialAndEnergy[gridIndex][4]; + output->coulombVirial_[ZZ][XX] += + scale * 0.25F * pmeGpu->staging.h_virialAndEnergy[gridIndex][4]; + output->coulombVirial_[YY][ZZ] += + scale * 0.25F * pmeGpu->staging.h_virialAndEnergy[gridIndex][5]; + output->coulombVirial_[ZZ][YY] += + scale * 0.25F * pmeGpu->staging.h_virialAndEnergy[gridIndex][5]; + output->coulombEnergy_ += scale * 0.5F * pmeGpu->staging.h_virialAndEnergy[gridIndex][6]; + } + if (pmeGpu->common->ngrids > 1) + { + output->coulombDvdl_ = 0.5F + * (pmeGpu->staging.h_virialAndEnergy[FEP_STATE_B][6] + - pmeGpu->staging.h_virialAndEnergy[FEP_STATE_A][6]); + } } /*! 
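The per-grid reduction above is the standard linear Coulomb interpolation between the two FEP end states. Writing E_A and E_B for the energies obtained from the state-A and state-B grids (element 6 of each raw virial/energy buffer holds twice the energy, hence the factors of 0.5F), the reported quantities are

    E(\lambda) = (1 - \lambda)\, E_A + \lambda\, E_B
    \frac{\partial E}{\partial \lambda} = E_B - E_A

so coulombDvdl_ is simply half the difference of the two raw accumulator values, and the virial tensor components are combined with the same (1 - \lambda) and \lambda weights.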
\brief Sets the force-related members in \p output @@ -536,7 +615,7 @@ static void pme_gpu_getForceOutput(PmeGpu* pmeGpu, PmeOutput* output) } } -PmeOutput pme_gpu_getOutput(const gmx_pme_t& pme, const bool computeEnergyAndVirial) +PmeOutput pme_gpu_getOutput(const gmx_pme_t& pme, const bool computeEnergyAndVirial, const real lambdaQ) { PmeGpu* pmeGpu = pme.gpu; @@ -548,7 +627,7 @@ PmeOutput pme_gpu_getOutput(const gmx_pme_t& pme, const bool computeEnergyAndVir { if (pme_gpu_settings(pmeGpu).performGPUSolve) { - pme_gpu_getEnergyAndVirial(pme, &output); + pme_gpu_getEnergyAndVirial(pme, lambdaQ, &output); } else { @@ -590,9 +669,13 @@ void pme_gpu_update_input_box(PmeGpu gmx_unused* pmeGpu, const matrix gmx_unused static void pme_gpu_reinit_grids(PmeGpu* pmeGpu) { auto* kernelParamsPtr = pme_gpu_get_kernel_params_base_ptr(pmeGpu); + + GMX_ASSERT( + pmeGpu->common->ngrids == 1 || pmeGpu->common->ngrids == 2, + "Only one (normal Coulomb PME) or two (FEP coulomb PME) PME grids can be used on GPU"); + kernelParamsPtr->grid.ewaldFactor = (M_PI * M_PI) / (pmeGpu->common->ewaldcoeff_q * pmeGpu->common->ewaldcoeff_q); - /* The grid size variants */ for (int i = 0; i < DIM; i++) { @@ -613,14 +696,17 @@ static void pme_gpu_reinit_grids(PmeGpu* pmeGpu) kernelParamsPtr->grid.realGridSizePadded[ZZ] = (kernelParamsPtr->grid.realGridSize[ZZ] / 2 + 1) * 2; } - /* GPU FFT: n real elements correspond to (n / 2 + 1) complex elements in minor dimension */ kernelParamsPtr->grid.complexGridSize[ZZ] /= 2; kernelParamsPtr->grid.complexGridSize[ZZ]++; kernelParamsPtr->grid.complexGridSizePadded[ZZ] = kernelParamsPtr->grid.complexGridSize[ZZ]; pme_gpu_realloc_and_copy_fract_shifts(pmeGpu); - pme_gpu_realloc_and_copy_bspline_values(pmeGpu); + for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++) + { + pme_gpu_realloc_and_copy_bspline_values(pmeGpu, gridIndex); + } + pme_gpu_realloc_grids(pmeGpu); pme_gpu_reinit_3dfft(pmeGpu); } @@ -641,7 +727,7 @@ static void pme_gpu_copy_common_data_from(const gmx_pme_t* pme) /* TODO: Consider refactoring the CPU PME code to use the same structure, * so that this function becomes 2 lines */ PmeGpu* pmeGpu = pme->gpu; - pmeGpu->common->ngrids = pme->ngrids; + pmeGpu->common->ngrids = pme->bFEP_q ? 
2 : 1; pmeGpu->common->epsilon_r = pme->epsilon_r; pmeGpu->common->ewaldcoeff_q = pme->ewaldcoeff_q; pmeGpu->common->nk[XX] = pme->nkx; @@ -725,9 +811,9 @@ static void pme_gpu_init(gmx_pme_t* pme, pmeGpu->initializedClfftLibrary_ = std::make_unique(); pme_gpu_init_internal(pmeGpu, deviceContext, deviceStream); - pme_gpu_alloc_energy_virial(pmeGpu); pme_gpu_copy_common_data_from(pme); + pme_gpu_alloc_energy_virial(pmeGpu); GMX_ASSERT(pmeGpu->common->epsilon_r != 0.0F, "PME GPU: bad electrostatic coefficient"); @@ -804,7 +890,7 @@ void pme_gpu_destroy(PmeGpu* pmeGpu) delete pmeGpu; } -void pme_gpu_reinit_atoms(PmeGpu* pmeGpu, const int nAtoms, const real* charges) +void pme_gpu_reinit_atoms(PmeGpu* pmeGpu, const int nAtoms, const real* chargesA, const real* chargesB) { auto* kernelParamsPtr = pme_gpu_get_kernel_params_base_ptr(pmeGpu); kernelParamsPtr->atoms.nAtoms = nAtoms; @@ -817,8 +903,25 @@ void pme_gpu_reinit_atoms(PmeGpu* pmeGpu, const int nAtoms, const real* charges) GMX_RELEASE_ASSERT(false, "Only single precision supported"); GMX_UNUSED_VALUE(charges); #else - pme_gpu_realloc_and_copy_input_coefficients(pmeGpu, reinterpret_cast(charges)); + int gridIndex = 0; /* Could also be checked for haveToRealloc, but the copy always needs to be performed */ + pme_gpu_realloc_and_copy_input_coefficients(pmeGpu, reinterpret_cast(chargesA), gridIndex); + gridIndex++; + if (chargesB != nullptr) + { + pme_gpu_realloc_and_copy_input_coefficients( + pmeGpu, reinterpret_cast(chargesB), gridIndex); + } + else + { + /* Fill the second set of coefficients with chargesA as well to be able to avoid + * conditionals in the GPU kernels */ + /* FIXME: This should be avoided by making a separate templated version of the + * relevant kernel(s) (probably only pme_gather_kernel). That would require a + * reduction of the current number of templated parameters of that kernel. */ + pme_gpu_realloc_and_copy_input_coefficients( + pmeGpu, reinterpret_cast(chargesA), gridIndex); + } #endif if (haveToRealloc) @@ -849,7 +952,7 @@ static CommandEvent* pme_gpu_fetch_timing_event(const PmeGpu* pmeGpu, size_t PME return timingEvent; } -void pme_gpu_3dfft(const PmeGpu* pmeGpu, gmx_fft_direction dir, int grid_index) +void pme_gpu_3dfft(const PmeGpu* pmeGpu, gmx_fft_direction dir, const int grid_index) { int timerId = (dir == GMX_FFT_REAL_TO_COMPLEX) ? gtPME_FFT_R2C : gtPME_FFT_C2R; @@ -881,32 +984,64 @@ std::pair inline pmeGpuCreateGrid(const PmeGpu* pmeGpu, int blockCount * \param[in] pmeGpu The PME GPU structure. * \param[in] threadsPerAtom Controls whether we should use order or order*order threads per atom * \param[in] writeSplinesToGlobal bool controlling if we should write spline data to global memory + * \param[in] numGrids Number of grids to use. numGrids == 2 if Coulomb is perturbed. 
* * \return Pointer to CUDA kernel */ -static auto selectSplineAndSpreadKernelPtr(const PmeGpu* pmeGpu, ThreadsPerAtom threadsPerAtom, bool writeSplinesToGlobal) +static auto selectSplineAndSpreadKernelPtr(const PmeGpu* pmeGpu, + ThreadsPerAtom threadsPerAtom, + bool writeSplinesToGlobal, + const int numGrids) { PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr; if (writeSplinesToGlobal) { if (threadsPerAtom == ThreadsPerAtom::Order) { - kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelWriteSplinesThPerAtom4; + if (numGrids == 2) + { + kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelWriteSplinesThPerAtom4Dual; + } + else + { + kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelWriteSplinesThPerAtom4Single; + } } else { - kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelWriteSplines; + if (numGrids == 2) + { + kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelWriteSplinesDual; + } + else + { + kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelWriteSplinesSingle; + } } } else { if (threadsPerAtom == ThreadsPerAtom::Order) { - kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelThPerAtom4; + if (numGrids == 2) + { + kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelThPerAtom4Dual; + } + else + { + kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelThPerAtom4Single; + } } else { - kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernel; + if (numGrids == 2) + { + kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelDual; + } + else + { + kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelSingle; + } } } @@ -919,12 +1054,14 @@ static auto selectSplineAndSpreadKernelPtr(const PmeGpu* pmeGpu, ThreadsPerAtom * \param[in] pmeGpu The PME GPU structure. * \param[in] threadsPerAtom Controls whether we should use order or order*order threads per atom * \param[in] writeSplinesToGlobal bool controlling if we should write spline data to global memory + * \param[in] numGrids Number of grids to use. numGrids == 2 if Coulomb is perturbed. * * \return Pointer to CUDA kernel */ static auto selectSplineKernelPtr(const PmeGpu* pmeGpu, ThreadsPerAtom threadsPerAtom, - bool gmx_unused writeSplinesToGlobal) + bool gmx_unused writeSplinesToGlobal, + const int numGrids) { PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr; GMX_ASSERT( @@ -933,11 +1070,25 @@ static auto selectSplineKernelPtr(const PmeGpu* pmeGpu, if (threadsPerAtom == ThreadsPerAtom::Order) { - kernelPtr = pmeGpu->programHandle_->impl_->splineKernelThPerAtom4; + if (numGrids == 2) + { + kernelPtr = pmeGpu->programHandle_->impl_->splineKernelThPerAtom4Dual; + } + else + { + kernelPtr = pmeGpu->programHandle_->impl_->splineKernelThPerAtom4Single; + } } else { - kernelPtr = pmeGpu->programHandle_->impl_->splineKernel; + if (numGrids == 2) + { + kernelPtr = pmeGpu->programHandle_->impl_->splineKernelDual; + } + else + { + kernelPtr = pmeGpu->programHandle_->impl_->splineKernelSingle; + } } return kernelPtr; } @@ -948,21 +1099,38 @@ static auto selectSplineKernelPtr(const PmeGpu* pmeGpu, * \param[in] pmeGpu The PME GPU structure. * \param[in] threadsPerAtom Controls whether we should use order or order*order threads per atom * \param[in] writeSplinesToGlobal bool controlling if we should write spline data to global memory + * \param[in] numGrids Number of grids to use. numGrids == 2 if Coulomb is perturbed. 
* * \return Pointer to CUDA kernel */ -static auto selectSpreadKernelPtr(const PmeGpu* pmeGpu, ThreadsPerAtom threadsPerAtom, bool writeSplinesToGlobal) +static auto selectSpreadKernelPtr(const PmeGpu* pmeGpu, + ThreadsPerAtom threadsPerAtom, + bool writeSplinesToGlobal, + const int numGrids) { PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr; if (writeSplinesToGlobal) { if (threadsPerAtom == ThreadsPerAtom::Order) { - kernelPtr = pmeGpu->programHandle_->impl_->spreadKernelThPerAtom4; + if (numGrids == 2) + { + kernelPtr = pmeGpu->programHandle_->impl_->spreadKernelThPerAtom4Dual; + } + { + kernelPtr = pmeGpu->programHandle_->impl_->spreadKernelThPerAtom4Single; + } } else { - kernelPtr = pmeGpu->programHandle_->impl_->spreadKernel; + if (numGrids == 2) + { + kernelPtr = pmeGpu->programHandle_->impl_->spreadKernelDual; + } + else + { + kernelPtr = pmeGpu->programHandle_->impl_->spreadKernelSingle; + } } } else @@ -971,11 +1139,25 @@ static auto selectSpreadKernelPtr(const PmeGpu* pmeGpu, ThreadsPerAtom threadsPe using the spline and spread Kernel */ if (threadsPerAtom == ThreadsPerAtom::Order) { - kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelThPerAtom4; + if (numGrids == 2) + { + kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelThPerAtom4Dual; + } + else + { + kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelThPerAtom4Single; + } } else { - kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernel; + if (numGrids == 2) + { + kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelDual; + } + else + { + kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelSingle; + } } } return kernelPtr; @@ -983,14 +1165,18 @@ static auto selectSpreadKernelPtr(const PmeGpu* pmeGpu, ThreadsPerAtom threadsPe void pme_gpu_spread(const PmeGpu* pmeGpu, GpuEventSynchronizer* xReadyOnDevice, - int gmx_unused gridIndex, - real* h_grid, - bool computeSplines, - bool spreadCharges) + real** h_grids, + bool computeSplines, + bool spreadCharges, + const real lambda) { + GMX_ASSERT( + pmeGpu->common->ngrids == 1 || pmeGpu->common->ngrids == 2, + "Only one (normal Coulomb PME) or two (FEP coulomb PME) PME grids can be used on GPU"); + GMX_ASSERT(computeSplines || spreadCharges, "PME spline/spread kernel has invalid input (nothing to do)"); - const auto* kernelParamsPtr = pmeGpu->kernelParams.get(); + auto* kernelParamsPtr = pmeGpu->kernelParams.get(); GMX_ASSERT(kernelParamsPtr->atoms.nAtoms > 0, "No atom data in PME GPU spread"); const size_t blockSize = pmeGpu->programHandle_->impl_->spreadWorkGroupSize; @@ -1031,6 +1217,15 @@ void pme_gpu_spread(const PmeGpu* pmeGpu, const int blockCount = pmeGpu->nAtomsAlloc / atomsPerBlock; auto dimGrid = pmeGpuCreateGrid(pmeGpu, blockCount); + if (pmeGpu->common->ngrids == 1) + { + kernelParamsPtr->current.scale = 1.0; + } + else + { + kernelParamsPtr->current.scale = 1.0 - lambda; + } + KernelLaunchConfig config; config.blockSize[0] = order; config.blockSize[1] = (pmeGpu->settings.threadsPerAtom == ThreadsPerAtom::Order ? 
1 : order); @@ -1046,20 +1241,22 @@ void pme_gpu_spread(const PmeGpu* pmeGpu, { timingId = gtPME_SPLINEANDSPREAD; kernelPtr = selectSplineAndSpreadKernelPtr(pmeGpu, pmeGpu->settings.threadsPerAtom, - writeGlobal || (!recalculateSplines)); + writeGlobal || (!recalculateSplines), + pmeGpu->common->ngrids); } else { timingId = gtPME_SPLINE; kernelPtr = selectSplineKernelPtr(pmeGpu, pmeGpu->settings.threadsPerAtom, - writeGlobal || (!recalculateSplines)); + writeGlobal || (!recalculateSplines), + pmeGpu->common->ngrids); } } else { timingId = gtPME_SPREAD; kernelPtr = selectSpreadKernelPtr(pmeGpu, pmeGpu->settings.threadsPerAtom, - writeGlobal || (!recalculateSplines)); + writeGlobal || (!recalculateSplines), pmeGpu->common->ngrids); } @@ -1071,9 +1268,10 @@ void pme_gpu_spread(const PmeGpu* pmeGpu, const auto kernelArgs = prepareGpuKernelArguments( kernelPtr, config, kernelParamsPtr, &kernelParamsPtr->atoms.d_theta, &kernelParamsPtr->atoms.d_dtheta, &kernelParamsPtr->atoms.d_gridlineIndices, - &kernelParamsPtr->grid.d_realGrid, &kernelParamsPtr->grid.d_fractShiftsTable, - &kernelParamsPtr->grid.d_gridlineIndicesTable, &kernelParamsPtr->atoms.d_coefficients, - &kernelParamsPtr->atoms.d_coordinates); + &kernelParamsPtr->grid.d_realGrid[FEP_STATE_A], &kernelParamsPtr->grid.d_realGrid[FEP_STATE_B], + &kernelParamsPtr->grid.d_fractShiftsTable, &kernelParamsPtr->grid.d_gridlineIndicesTable, + &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_A], + &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_B], &kernelParamsPtr->atoms.d_coordinates); #endif launchGpuKernel(kernelPtr, config, pmeGpu->archSpecific->pmeStream_, timingEvent, @@ -1084,7 +1282,11 @@ void pme_gpu_spread(const PmeGpu* pmeGpu, const bool copyBackGrid = spreadCharges && (!settings.performGPUFFT || settings.copyAllOutputs); if (copyBackGrid) { - pme_gpu_copy_output_spread_grid(pmeGpu, h_grid); + for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++) + { + float* h_grid = h_grids[gridIndex]; + pme_gpu_copy_output_spread_grid(pmeGpu, h_grid, gridIndex); + } } const bool copyBackAtomData = computeSplines && (!settings.performGPUGather || settings.copyAllOutputs); @@ -1094,8 +1296,18 @@ void pme_gpu_spread(const PmeGpu* pmeGpu, } } -void pme_gpu_solve(const PmeGpu* pmeGpu, t_complex* h_grid, GridOrdering gridOrdering, bool computeEnergyAndVirial) +void pme_gpu_solve(const PmeGpu* pmeGpu, + const int gridIndex, + t_complex* h_grid, + GridOrdering gridOrdering, + bool computeEnergyAndVirial) { + GMX_ASSERT( + pmeGpu->common->ngrids == 1 || pmeGpu->common->ngrids == 2, + "Only one (normal Coulomb PME) or two (FEP coulomb PME) PME grids can be used on GPU"); + GMX_ASSERT(gridIndex < pmeGpu->common->ngrids, + "Invalid combination of gridIndex and number of grids"); + const auto& settings = pmeGpu->settings; const bool copyInputAndOutputGrid = !settings.performGPUFFT || settings.copyAllOutputs; @@ -1104,9 +1316,9 @@ void pme_gpu_solve(const PmeGpu* pmeGpu, t_complex* h_grid, GridOrdering gridOrd float* h_gridFloat = reinterpret_cast(h_grid); if (copyInputAndOutputGrid) { - copyToDeviceBuffer(&kernelParamsPtr->grid.d_fourierGrid, h_gridFloat, 0, - pmeGpu->archSpecific->complexGridSize, pmeGpu->archSpecific->pmeStream_, - pmeGpu->settings.transferKind, nullptr); + copyToDeviceBuffer(&kernelParamsPtr->grid.d_fourierGrid[gridIndex], h_gridFloat, 0, + pmeGpu->archSpecific->complexGridSize[gridIndex], + pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); } int majorDim = -1, middleDim = -1, minorDim = -1; @@ 
-1160,13 +1372,29 @@ void pme_gpu_solve(const PmeGpu* pmeGpu, t_complex* h_grid, GridOrdering gridOrd PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr; if (gridOrdering == GridOrdering::YZX) { - kernelPtr = computeEnergyAndVirial ? pmeGpu->programHandle_->impl_->solveYZXEnergyKernel - : pmeGpu->programHandle_->impl_->solveYZXKernel; + if (gridIndex == 0) + { + kernelPtr = computeEnergyAndVirial ? pmeGpu->programHandle_->impl_->solveYZXEnergyKernelA + : pmeGpu->programHandle_->impl_->solveYZXKernelA; + } + else + { + kernelPtr = computeEnergyAndVirial ? pmeGpu->programHandle_->impl_->solveYZXEnergyKernelB + : pmeGpu->programHandle_->impl_->solveYZXKernelB; + } } else if (gridOrdering == GridOrdering::XYZ) { - kernelPtr = computeEnergyAndVirial ? pmeGpu->programHandle_->impl_->solveXYZEnergyKernel - : pmeGpu->programHandle_->impl_->solveXYZKernel; + if (gridIndex == 0) + { + kernelPtr = computeEnergyAndVirial ? pmeGpu->programHandle_->impl_->solveXYZEnergyKernelA + : pmeGpu->programHandle_->impl_->solveXYZKernelA; + } + else + { + kernelPtr = computeEnergyAndVirial ? pmeGpu->programHandle_->impl_->solveXYZEnergyKernelB + : pmeGpu->programHandle_->impl_->solveXYZKernelB; + } } pme_gpu_start_timing(pmeGpu, timingId); @@ -1175,8 +1403,9 @@ void pme_gpu_solve(const PmeGpu* pmeGpu, t_complex* h_grid, GridOrdering gridOrd const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr); #else const auto kernelArgs = prepareGpuKernelArguments( - kernelPtr, config, kernelParamsPtr, &kernelParamsPtr->grid.d_splineModuli, - &kernelParamsPtr->constants.d_virialAndEnergy, &kernelParamsPtr->grid.d_fourierGrid); + kernelPtr, config, kernelParamsPtr, &kernelParamsPtr->grid.d_splineModuli[gridIndex], + &kernelParamsPtr->constants.d_virialAndEnergy[gridIndex], + &kernelParamsPtr->grid.d_fourierGrid[gridIndex]); #endif launchGpuKernel(kernelPtr, config, pmeGpu->archSpecific->pmeStream_, timingEvent, "PME solve", kernelArgs); @@ -1184,16 +1413,17 @@ void pme_gpu_solve(const PmeGpu* pmeGpu, t_complex* h_grid, GridOrdering gridOrd if (computeEnergyAndVirial) { - copyFromDeviceBuffer(pmeGpu->staging.h_virialAndEnergy, - &kernelParamsPtr->constants.d_virialAndEnergy, 0, c_virialAndEnergyCount, - pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); + copyFromDeviceBuffer(pmeGpu->staging.h_virialAndEnergy[gridIndex], + &kernelParamsPtr->constants.d_virialAndEnergy[gridIndex], 0, + c_virialAndEnergyCount, pmeGpu->archSpecific->pmeStream_, + pmeGpu->settings.transferKind, nullptr); } if (copyInputAndOutputGrid) { - copyFromDeviceBuffer(h_gridFloat, &kernelParamsPtr->grid.d_fourierGrid, 0, - pmeGpu->archSpecific->complexGridSize, pmeGpu->archSpecific->pmeStream_, - pmeGpu->settings.transferKind, nullptr); + copyFromDeviceBuffer(h_gridFloat, &kernelParamsPtr->grid.d_fourierGrid[gridIndex], 0, + pmeGpu->archSpecific->complexGridSize[gridIndex], + pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); } } @@ -1203,10 +1433,14 @@ void pme_gpu_solve(const PmeGpu* pmeGpu, t_complex* h_grid, GridOrdering gridOrd * \param[in] pmeGpu The PME GPU structure. * \param[in] threadsPerAtom Controls whether we should use order or order*order threads per atom * \param[in] readSplinesFromGlobal bool controlling if we should write spline data to global memory + * \param[in] numGrids Number of grids to use. numGrids == 2 if Coulomb is perturbed. 
* * \return Pointer to CUDA kernel */ -inline auto selectGatherKernelPtr(const PmeGpu* pmeGpu, ThreadsPerAtom threadsPerAtom, bool readSplinesFromGlobal) +inline auto selectGatherKernelPtr(const PmeGpu* pmeGpu, + ThreadsPerAtom threadsPerAtom, + bool readSplinesFromGlobal, + const int numGrids) { PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr; @@ -1215,34 +1449,70 @@ inline auto selectGatherKernelPtr(const PmeGpu* pmeGpu, ThreadsPerAtom threadsPe { if (threadsPerAtom == ThreadsPerAtom::Order) { - kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelReadSplinesThPerAtom4; + if (numGrids == 2) + { + kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelReadSplinesThPerAtom4Dual; + } + else + { + kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelReadSplinesThPerAtom4Single; + } } else { - kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelReadSplines; + if (numGrids == 2) + { + kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelReadSplinesDual; + } + else + { + kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelReadSplinesSingle; + } } } else { if (threadsPerAtom == ThreadsPerAtom::Order) { - kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelThPerAtom4; + if (numGrids == 2) + { + kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelThPerAtom4Dual; + } + else + { + kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelThPerAtom4Single; + } } else { - kernelPtr = pmeGpu->programHandle_->impl_->gatherKernel; + if (numGrids == 2) + { + kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelDual; + } + else + { + kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelSingle; + } } } return kernelPtr; } - -void pme_gpu_gather(PmeGpu* pmeGpu, const float* h_grid) +void pme_gpu_gather(PmeGpu* pmeGpu, real** h_grids, const float lambda) { + GMX_ASSERT( + pmeGpu->common->ngrids == 1 || pmeGpu->common->ngrids == 2, + "Only one (normal Coulomb PME) or two (FEP coulomb PME) PME grids can be used on GPU"); + const auto& settings = pmeGpu->settings; + if (!settings.performGPUFFT || settings.copyAllOutputs) { - pme_gpu_copy_input_gather_grid(pmeGpu, const_cast(h_grid)); + for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++) + { + float* h_grid = const_cast(h_grids[gridIndex]); + pme_gpu_copy_input_gather_grid(pmeGpu, h_grid, gridIndex); + } } if (settings.copyAllOutputs) @@ -1280,22 +1550,33 @@ void pme_gpu_gather(PmeGpu* pmeGpu, const float* h_grid) // TODO test different cache configs - int timingId = gtPME_GATHER; - PmeGpuProgramImpl::PmeKernelHandle kernelPtr = selectGatherKernelPtr( - pmeGpu, pmeGpu->settings.threadsPerAtom, readGlobal || (!recalculateSplines)); + int timingId = gtPME_GATHER; + PmeGpuProgramImpl::PmeKernelHandle kernelPtr = + selectGatherKernelPtr(pmeGpu, pmeGpu->settings.threadsPerAtom, + readGlobal || (!recalculateSplines), pmeGpu->common->ngrids); // TODO design kernel selection getters and make PmeGpu a friend of PmeGpuProgramImpl pme_gpu_start_timing(pmeGpu, timingId); - auto* timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timingId); - const auto* kernelParamsPtr = pmeGpu->kernelParams.get(); + auto* timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timingId); + auto* kernelParamsPtr = pmeGpu->kernelParams.get(); + if (pmeGpu->common->ngrids == 1) + { + kernelParamsPtr->current.scale = 1.0; + } + else + { + kernelParamsPtr->current.scale = 1.0 - lambda; + } + #if c_canEmbedBuffers const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr); #else const auto kernelArgs = prepareGpuKernelArguments( - 
kernelPtr, config, kernelParamsPtr, &kernelParamsPtr->atoms.d_coefficients, - &kernelParamsPtr->grid.d_realGrid, &kernelParamsPtr->atoms.d_theta, - &kernelParamsPtr->atoms.d_dtheta, &kernelParamsPtr->atoms.d_gridlineIndices, - &kernelParamsPtr->atoms.d_forces); + kernelPtr, config, kernelParamsPtr, &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_A], + &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_B], + &kernelParamsPtr->grid.d_realGrid[FEP_STATE_A], &kernelParamsPtr->grid.d_realGrid[FEP_STATE_B], + &kernelParamsPtr->atoms.d_theta, &kernelParamsPtr->atoms.d_dtheta, + &kernelParamsPtr->atoms.d_gridlineIndices, &kernelParamsPtr->atoms.d_forces); #endif launchGpuKernel(kernelPtr, config, pmeGpu->archSpecific->pmeStream_, timingEvent, "PME gather", kernelArgs); diff --git a/src/gromacs/ewald/pme_gpu_internal.h b/src/gromacs/ewald/pme_gpu_internal.h index f2cbc94c5d..1220b13984 100644 --- a/src/gromacs/ewald/pme_gpu_internal.h +++ b/src/gromacs/ewald/pme_gpu_internal.h @@ -69,6 +69,15 @@ struct PmeGpuStaging; struct PmeGpuSettings; struct t_complex; +#ifndef FEP_STATE_A +//! Grid index of FEP state A (or unperturbed system) +# define FEP_STATE_A 0 +#endif +#ifndef FEP_STATE_B +//! Grid index of FEP state B +# define FEP_STATE_B 1 +#endif + namespace gmx { template @@ -134,8 +143,10 @@ void pme_gpu_clear_energy_virial(const PmeGpu* pmeGpu); * Reallocates and copies the pre-computed B-spline values to the GPU. * * \param[in,out] pmeGpu The PME GPU structure. + * \param[in] gridIndex The index of the grid to use. 0 is Coulomb in the normal + * state or FEP state A and 1 is Coulomb in FEP state B. */ -void pme_gpu_realloc_and_copy_bspline_values(PmeGpu* pmeGpu); +void pme_gpu_realloc_and_copy_bspline_values(PmeGpu* pmeGpu, int gridIndex = 0); /*! \libinternal \brief * Frees the pre-computed B-spline values on the GPU (and the transfer CPU buffers). @@ -188,11 +199,15 @@ bool pme_gpu_stream_query(const PmeGpu* pmeGpu); * * \param[in] pmeGpu The PME GPU structure. * \param[in] h_coefficients The input atom charges/coefficients. + * \param[in] gridIndex The index of the grid to use. 0 is Coulomb in the normal + * state or FEP state A and 1 is Coulomb in FEP state B. * * Does not need to be done for every PME computation, only whenever the local charges change. * (So, in the beginning of the run, or on DD step). */ -void pme_gpu_realloc_and_copy_input_coefficients(PmeGpu* pmeGpu, const float* h_coefficients); +void pme_gpu_realloc_and_copy_input_coefficients(const PmeGpu* pmeGpu, + const float* h_coefficients, + int gridIndex = 0); /*! \libinternal \brief * Frees the charges/coefficients on the GPU. @@ -268,30 +283,34 @@ void pme_gpu_free_fract_shifts(const PmeGpu* pmeGpu); /*! \libinternal \brief * Copies the input real-space grid from the host to the GPU. * - * \param[in] pmeGpu The PME GPU structure. - * \param[in] h_grid The host-side grid buffer. + * \param[in] pmeGpu The PME GPU structure. + * \param[in] h_grid The host-side grid buffer. + * \param[in] gridIndex The index of the grid to use. 0 is Coulomb in the normal + * state or FEP state A and 1 is Coulomb in FEP state B. */ -void pme_gpu_copy_input_gather_grid(const PmeGpu* pmeGpu, float* h_grid); +void pme_gpu_copy_input_gather_grid(const PmeGpu* pmeGpu, const float* h_grid, int gridIndex = 0); /*! \libinternal \brief * Copies the output real-space grid from the GPU to the host. * - * \param[in] pmeGpu The PME GPU structure. - * \param[out] h_grid The host-side grid buffer. + * \param[in] pmeGpu The PME GPU structure. 
+ * \param[out] h_grid The host-side grid buffer. + * \param[in] gridIndex The index of the grid to use. 0 is Coulomb in the normal + * state or FEP state A and 1 is Coulomb in FEP state B. */ -void pme_gpu_copy_output_spread_grid(const PmeGpu* pmeGpu, float* h_grid); +void pme_gpu_copy_output_spread_grid(const PmeGpu* pmeGpu, float* h_grid, int gridIndex = 0); /*! \libinternal \brief * Copies the spread output spline data and gridline indices from the GPU to the host. * - * \param[in] pmeGpu The PME GPU structure. + * \param[in] pmeGpu The PME GPU structure. */ void pme_gpu_copy_output_spread_atom_data(const PmeGpu* pmeGpu); /*! \libinternal \brief * Copies the gather input spline data and gridline indices from the host to the GPU. * - * \param[in] pmeGpu The PME GPU structure. + * \param[in] pmeGpu The PME GPU structure. */ void pme_gpu_copy_input_gather_atom_data(const PmeGpu* pmeGpu); @@ -321,40 +340,44 @@ void pme_gpu_destroy_3dfft(const PmeGpu* pmeGpu); /*! \libinternal \brief * A GPU spline computation and charge spreading function. * - * \param[in] pmeGpu The PME GPU structure. - * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in the device memory; - * can be nullptr when invoked on a separate PME rank or from PME tests. - * \param[in] gridIndex Index of the PME grid - unused, assumed to be 0. - * \param[out] h_grid The host-side grid buffer (used only if the result of the spread is expected on the host, - * e.g. testing or host-side FFT) - * \param[in] computeSplines Should the computation of spline parameters and gridline indices be performed. - * \param[in] spreadCharges Should the charges/coefficients be spread on the grid. + * \param[in] pmeGpu The PME GPU structure. + * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in the device memory; + * can be nullptr when invoked on a separate PME rank or from PME tests. + * \param[out] h_grids The host-side grid buffers (used only if the result of the spread is expected on the host, + * e.g. testing or host-side FFT) + * \param[in] computeSplines Should the computation of spline parameters and gridline indices be performed. + * \param[in] spreadCharges Should the charges/coefficients be spread on the grid. + * \param[in] lambda The lambda value of the current system state. */ GPU_FUNC_QUALIFIER void pme_gpu_spread(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu), GpuEventSynchronizer* GPU_FUNC_ARGUMENT(xReadyOnDevice), - int GPU_FUNC_ARGUMENT(gridIndex), - real* GPU_FUNC_ARGUMENT(h_grid), + float** GPU_FUNC_ARGUMENT(h_grids), bool GPU_FUNC_ARGUMENT(computeSplines), - bool GPU_FUNC_ARGUMENT(spreadCharges)) GPU_FUNC_TERM; + bool GPU_FUNC_ARGUMENT(spreadCharges), + const real GPU_FUNC_ARGUMENT(lambda)) GPU_FUNC_TERM; /*! \libinternal \brief * 3D FFT R2C/C2R routine. * * \param[in] pmeGpu The PME GPU structure. * \param[in] direction Transform direction (real-to-complex or complex-to-real) - * \param[in] gridIndex Index of the PME grid - unused, assumed to be 0. + * \param[in] gridIndex The index of the grid to use. 0 is Coulomb in the normal + * state or FEP state A and 1 is Coulomb in FEP state B. */ -void pme_gpu_3dfft(const PmeGpu* pmeGpu, enum gmx_fft_direction direction, int gridIndex); +void pme_gpu_3dfft(const PmeGpu* pmeGpu, enum gmx_fft_direction direction, int gridIndex = 0); /*! \libinternal \brief * A GPU Fourier space solving function. * * \param[in] pmeGpu The PME GPU structure. + * \param[in] gridIndex The index of the grid to use. 
0 is Coulomb in the normal + * state or FEP state A and 1 is Coulomb in FEP state B. * \param[in,out] h_grid The host-side input and output Fourier grid buffer (used only with testing or host-side FFT) * \param[in] gridOrdering Specifies the dimenion ordering of the complex grid. TODO: store this information? * \param[in] computeEnergyAndVirial Tells if the energy and virial computation should be performed. */ GPU_FUNC_QUALIFIER void pme_gpu_solve(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu), + int GPU_FUNC_ARGUMENT(gridIndex), t_complex* GPU_FUNC_ARGUMENT(h_grid), GridOrdering GPU_FUNC_ARGUMENT(gridOrdering), bool GPU_FUNC_ARGUMENT(computeEnergyAndVirial)) GPU_FUNC_TERM; @@ -362,11 +385,14 @@ GPU_FUNC_QUALIFIER void pme_gpu_solve(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu), /*! \libinternal \brief * A GPU force gathering function. * - * \param[in] pmeGpu The PME GPU structure. - * reductions. \param[in] h_grid The host-side grid buffer (used only in testing mode) + * \param[in] pmeGpu The PME GPU structure. + * \param[in] h_grids The host-side grid buffer (used only in testing mode). + * \param[in] lambda The lambda value to use. */ -GPU_FUNC_QUALIFIER void pme_gpu_gather(PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu), - const float* GPU_FUNC_ARGUMENT(h_grid)) GPU_FUNC_TERM; +GPU_FUNC_QUALIFIER void pme_gpu_gather(PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu), + float** GPU_FUNC_ARGUMENT(h_grids), + float GPU_FUNC_ARGUMENT(lambda)) GPU_FUNC_TERM; + /*! \brief Sets the device pointer to coordinate data * \param[in] pmeGpu The PME GPU structure. @@ -435,9 +461,11 @@ inline void pme_gpu_set_testing(PmeGpu* pmeGpu, bool testing) * handled the solve stage. * * \param[in] pme The PME structure. + * \param[in] lambda The lambda value to use when calculating the results. * \param[out] output Pointer to output where energy and virial should be stored. */ GPU_FUNC_QUALIFIER void pme_gpu_getEnergyAndVirial(const gmx_pme_t& GPU_FUNC_ARGUMENT(pme), + float GPU_FUNC_ARGUMENT(lambda), PmeOutput* GPU_FUNC_ARGUMENT(output)) GPU_FUNC_TERM; /*! \libinternal \brief @@ -445,10 +473,12 @@ GPU_FUNC_QUALIFIER void pme_gpu_getEnergyAndVirial(const gmx_pme_t& GPU_FUNC_ARG * * \param[in] pme The PME structure. * \param[in] computeEnergyAndVirial Whether the energy and virial are being computed + * \param[in] lambdaQ The Coulomb lambda to use when finalizing the output. * \returns The output object. */ GPU_FUNC_QUALIFIER PmeOutput pme_gpu_getOutput(const gmx_pme_t& GPU_FUNC_ARGUMENT(pme), - bool GPU_FUNC_ARGUMENT(computeEnergyAndVirial)) + bool GPU_FUNC_ARGUMENT(computeEnergyAndVirial), + real GPU_FUNC_ARGUMENT(lambdaQ)) GPU_FUNC_TERM_WITH_RETURN(PmeOutput{}); /*! \libinternal \brief @@ -510,14 +540,16 @@ GPU_FUNC_QUALIFIER void pme_gpu_destroy(PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu)) GPU_F * * \param[in] pmeGpu The PME GPU structure. * \param[in] nAtoms The number of particles. - * \param[in] charges The pointer to the host-side array of particle charges. + * \param[in] chargesA The pointer to the host-side array of particle charges in the unperturbed state or FEP state A. + * \param[in] chargesB The pointer to the host-side array of particle charges in FEP state B. * * This is a function that should only be called in the beginning of the run and on domain * decomposition. Should be called before the pme_gpu_set_io_ranges. 
*/ GPU_FUNC_QUALIFIER void pme_gpu_reinit_atoms(PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu), int GPU_FUNC_ARGUMENT(nAtoms), - const real* GPU_FUNC_ARGUMENT(charges)) GPU_FUNC_TERM; + const real* GPU_FUNC_ARGUMENT(chargesA), + const real* GPU_FUNC_ARGUMENT(chargesB) = nullptr) GPU_FUNC_TERM; /*! \brief \libinternal * The PME GPU reinitialization function that is called both at the end of any PME computation and on any load balancing. @@ -534,11 +566,13 @@ void pme_gpu_reinit_computation(const PmeGpu* pmeGpu); * * \param[in] pme The PME data structure. * \param[in] computeEnergyAndVirial Tells if the energy and virial computation should be performed. + * \param[in] lambdaQ The Coulomb lambda to use when calculating the results. * \param[out] wcycle The wallclock counter. * \return The output forces, energy and virial */ GPU_FUNC_QUALIFIER PmeOutput pme_gpu_wait_finish_task(gmx_pme_t* GPU_FUNC_ARGUMENT(pme), bool GPU_FUNC_ARGUMENT(computeEnergyAndVirial), + real GPU_FUNC_ARGUMENT(lambdaQ), gmx_wallcycle* GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM_WITH_RETURN(PmeOutput{}); diff --git a/src/gromacs/ewald/pme_gpu_program_impl.cu b/src/gromacs/ewald/pme_gpu_program_impl.cu index 84bac0a467..1358793807 100644 --- a/src/gromacs/ewald/pme_gpu_program_impl.cu +++ b/src/gromacs/ewald/pme_gpu_program_impl.cu @@ -55,53 +55,79 @@ constexpr int c_pmeOrder = 4; constexpr bool c_wrapX = true; constexpr bool c_wrapY = true; +constexpr int c_stateA = 0; +constexpr int c_stateB = 1; + //! PME CUDA kernels forward declarations. Kernels are documented in their respective files. -template +template void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams kernelParams); // Add extern declarations to inform that there will be a definition // provided in another translation unit. // clang-format off extern template void -pme_spline_and_spread_kernel (const PmeGpuCudaKernelParams); +pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); extern template void -pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); +pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); extern template void -pme_spline_and_spread_kernel (const PmeGpuCudaKernelParams); +pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); extern template void -pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); +pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); extern template void -pme_spline_and_spread_kernel (const PmeGpuCudaKernelParams); +pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); extern template void -pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); +pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); extern template void -pme_spline_and_spread_kernel (const PmeGpuCudaKernelParams); +pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); extern template void -pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); -// clang-format on +pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); +extern template void +pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); +extern template void +pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); +extern template void +pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); +extern template void +pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); +extern template void +pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); +extern template void +pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); +extern template void +pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); +extern 
template void +pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); -template +template /* It is significantly slower to pass gridIndex as a kernel parameter */ void pme_solve_kernel(const PmeGpuCudaKernelParams kernelParams); // Add extern declarations to inform that there will be a definition // provided in another translation unit. // clang-format off -extern template void pme_solve_kernel(const PmeGpuCudaKernelParams); -extern template void pme_solve_kernel (const PmeGpuCudaKernelParams); -extern template void pme_solve_kernel(const PmeGpuCudaKernelParams); -extern template void pme_solve_kernel (const PmeGpuCudaKernelParams); +extern template void pme_solve_kernel(const PmeGpuCudaKernelParams); +extern template void pme_solve_kernel(const PmeGpuCudaKernelParams); +extern template void pme_solve_kernel(const PmeGpuCudaKernelParams); +extern template void pme_solve_kernel(const PmeGpuCudaKernelParams); +extern template void pme_solve_kernel(const PmeGpuCudaKernelParams); +extern template void pme_solve_kernel(const PmeGpuCudaKernelParams); +extern template void pme_solve_kernel(const PmeGpuCudaKernelParams); +extern template void pme_solve_kernel(const PmeGpuCudaKernelParams); // clang-format on -template +template void pme_gather_kernel(const PmeGpuCudaKernelParams kernelParams); // Add extern declarations to inform that there will be a definition // provided in another translation unit. // clang-format off -extern template void pme_gather_kernel (const PmeGpuCudaKernelParams); -extern template void pme_gather_kernel (const PmeGpuCudaKernelParams); -extern template void pme_gather_kernel(const PmeGpuCudaKernelParams); -extern template void pme_gather_kernel(const PmeGpuCudaKernelParams); +extern template void pme_gather_kernel (const PmeGpuCudaKernelParams); +extern template void pme_gather_kernel (const PmeGpuCudaKernelParams); +extern template void pme_gather_kernel (const PmeGpuCudaKernelParams); +extern template void pme_gather_kernel(const PmeGpuCudaKernelParams); +extern template void pme_gather_kernel (const PmeGpuCudaKernelParams); +extern template void pme_gather_kernel (const PmeGpuCudaKernelParams); +extern template void pme_gather_kernel (const PmeGpuCudaKernelParams); +extern template void pme_gather_kernel (const PmeGpuCudaKernelParams); // clang-format on PmeGpuProgramImpl::PmeGpuProgramImpl(const DeviceContext& deviceContext) : @@ -118,22 +144,38 @@ PmeGpuProgramImpl::PmeGpuProgramImpl(const DeviceContext& deviceContext) : * Similarly the spread kernel (without the spline) implies that we should read the spline data from global memory */ // clang-format off - splineAndSpreadKernel = pme_spline_and_spread_kernel; - splineAndSpreadKernelThPerAtom4 = pme_spline_and_spread_kernel; - splineAndSpreadKernelWriteSplines = pme_spline_and_spread_kernel; - splineAndSpreadKernelWriteSplinesThPerAtom4 = pme_spline_and_spread_kernel; - splineKernel = pme_spline_and_spread_kernel; - splineKernelThPerAtom4 = pme_spline_and_spread_kernel; - spreadKernel = pme_spline_and_spread_kernel; - spreadKernelThPerAtom4 = pme_spline_and_spread_kernel; - gatherKernel = pme_gather_kernel; - gatherKernelThPerAtom4 = pme_gather_kernel; - gatherKernelReadSplines = pme_gather_kernel; - gatherKernelReadSplinesThPerAtom4 = pme_gather_kernel; - solveXYZKernel = pme_solve_kernel; - solveXYZEnergyKernel = pme_solve_kernel; - solveYZXKernel = pme_solve_kernel; - solveYZXEnergyKernel = pme_solve_kernel; + splineAndSpreadKernelSingle = pme_spline_and_spread_kernel; + splineAndSpreadKernelThPerAtom4Single 
= pme_spline_and_spread_kernel; + splineAndSpreadKernelWriteSplinesSingle = pme_spline_and_spread_kernel; + splineAndSpreadKernelWriteSplinesThPerAtom4Single = pme_spline_and_spread_kernel; + splineKernelSingle = pme_spline_and_spread_kernel; + splineKernelThPerAtom4Single = pme_spline_and_spread_kernel; + spreadKernelSingle = pme_spline_and_spread_kernel; + spreadKernelThPerAtom4Single = pme_spline_and_spread_kernel; + splineAndSpreadKernelDual = pme_spline_and_spread_kernel; + splineAndSpreadKernelThPerAtom4Dual = pme_spline_and_spread_kernel; + splineAndSpreadKernelWriteSplinesDual = pme_spline_and_spread_kernel; + splineAndSpreadKernelWriteSplinesThPerAtom4Dual = pme_spline_and_spread_kernel; + splineKernelDual = pme_spline_and_spread_kernel; + splineKernelThPerAtom4Dual = pme_spline_and_spread_kernel; + spreadKernelDual = pme_spline_and_spread_kernel; + spreadKernelThPerAtom4Dual = pme_spline_and_spread_kernel; + gatherKernelSingle = pme_gather_kernel; + gatherKernelThPerAtom4Single = pme_gather_kernel; + gatherKernelReadSplinesSingle = pme_gather_kernel; + gatherKernelReadSplinesThPerAtom4Single = pme_gather_kernel; + gatherKernelDual = pme_gather_kernel; + gatherKernelThPerAtom4Dual = pme_gather_kernel; + gatherKernelReadSplinesDual = pme_gather_kernel; + gatherKernelReadSplinesThPerAtom4Dual = pme_gather_kernel; + solveXYZKernelA = pme_solve_kernel; + solveXYZEnergyKernelA = pme_solve_kernel; + solveYZXKernelA = pme_solve_kernel; + solveYZXEnergyKernelA = pme_solve_kernel; + solveXYZKernelB = pme_solve_kernel; + solveXYZEnergyKernelB = pme_solve_kernel; + solveYZXKernelB = pme_solve_kernel; + solveYZXEnergyKernelB = pme_solve_kernel; // clang-format on } diff --git a/src/gromacs/ewald/pme_gpu_program_impl.h b/src/gromacs/ewald/pme_gpu_program_impl.h index 0a7cc1a5c5..75d3f881d0 100644 --- a/src/gromacs/ewald/pme_gpu_program_impl.h +++ b/src/gromacs/ewald/pme_gpu_program_impl.h @@ -105,17 +105,28 @@ struct PmeGpuProgramImpl * or recalculated in the gather. * Spreading kernels also have hardcoded X/Y indices wrapping parameters, * as a placeholder for implementing 1/2D decomposition. + * The kernels are templated separately for spreading on one grid (one or + * two sets of coefficients) or on two grids (required for energy and virial + * calculations). 
*/ size_t spreadWorkGroupSize; - PmeKernelHandle splineKernel; - PmeKernelHandle splineKernelThPerAtom4; - PmeKernelHandle spreadKernel; - PmeKernelHandle spreadKernelThPerAtom4; - PmeKernelHandle splineAndSpreadKernel; - PmeKernelHandle splineAndSpreadKernelThPerAtom4; - PmeKernelHandle splineAndSpreadKernelWriteSplines; - PmeKernelHandle splineAndSpreadKernelWriteSplinesThPerAtom4; + PmeKernelHandle splineKernelSingle; + PmeKernelHandle splineKernelThPerAtom4Single; + PmeKernelHandle spreadKernelSingle; + PmeKernelHandle spreadKernelThPerAtom4Single; + PmeKernelHandle splineAndSpreadKernelSingle; + PmeKernelHandle splineAndSpreadKernelThPerAtom4Single; + PmeKernelHandle splineAndSpreadKernelWriteSplinesSingle; + PmeKernelHandle splineAndSpreadKernelWriteSplinesThPerAtom4Single; + PmeKernelHandle splineKernelDual; + PmeKernelHandle splineKernelThPerAtom4Dual; + PmeKernelHandle spreadKernelDual; + PmeKernelHandle spreadKernelThPerAtom4Dual; + PmeKernelHandle splineAndSpreadKernelDual; + PmeKernelHandle splineAndSpreadKernelThPerAtom4Dual; + PmeKernelHandle splineAndSpreadKernelWriteSplinesDual; + PmeKernelHandle splineAndSpreadKernelWriteSplinesThPerAtom4Dual; //@} //@{ @@ -123,25 +134,36 @@ struct PmeGpuProgramImpl * it can either reduce with previous forces in the host buffer, or ignore them. * Also similarly to the gather we can use either order(4) or order*order (16) threads per atom * and either recalculate the splines or read the ones written by the spread + * The kernels are templated separately for using one or two grids (required for + * calculating energies and virial). */ size_t gatherWorkGroupSize; - PmeKernelHandle gatherKernel; - PmeKernelHandle gatherKernelThPerAtom4; - PmeKernelHandle gatherKernelReadSplines; - PmeKernelHandle gatherKernelReadSplinesThPerAtom4; + PmeKernelHandle gatherKernelSingle; + PmeKernelHandle gatherKernelThPerAtom4Single; + PmeKernelHandle gatherKernelReadSplinesSingle; + PmeKernelHandle gatherKernelReadSplinesThPerAtom4Single; + PmeKernelHandle gatherKernelDual; + PmeKernelHandle gatherKernelThPerAtom4Dual; + PmeKernelHandle gatherKernelReadSplinesDual; + PmeKernelHandle gatherKernelReadSplinesThPerAtom4Dual; //@} //@{ /** Solve kernel doesn't care about the interpolation order, but can optionally * compute energy and virial, and supports XYZ and YZX grid orderings. + * The kernels are templated separately for grids in state A and B. 
*/ size_t solveMaxWorkGroupSize; - PmeKernelHandle solveYZXKernel; - PmeKernelHandle solveXYZKernel; - PmeKernelHandle solveYZXEnergyKernel; - PmeKernelHandle solveXYZEnergyKernel; + PmeKernelHandle solveYZXKernelA; + PmeKernelHandle solveXYZKernelA; + PmeKernelHandle solveYZXEnergyKernelA; + PmeKernelHandle solveXYZEnergyKernelA; + PmeKernelHandle solveYZXKernelB; + PmeKernelHandle solveXYZKernelB; + PmeKernelHandle solveYZXEnergyKernelB; + PmeKernelHandle solveXYZEnergyKernelB; //@} PmeGpuProgramImpl() = delete; diff --git a/src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp b/src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp index 6be367b6b3..c88baf40e5 100644 --- a/src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp +++ b/src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp @@ -72,14 +72,22 @@ PmeGpuProgramImpl::~PmeGpuProgramImpl() { // TODO: log releasing errors cl_int gmx_used_in_debug stat = 0; - stat |= clReleaseKernel(splineAndSpreadKernel); - stat |= clReleaseKernel(splineKernel); - stat |= clReleaseKernel(spreadKernel); - stat |= clReleaseKernel(gatherKernel); - stat |= clReleaseKernel(solveXYZKernel); - stat |= clReleaseKernel(solveXYZEnergyKernel); - stat |= clReleaseKernel(solveYZXKernel); - stat |= clReleaseKernel(solveYZXEnergyKernel); + stat |= clReleaseKernel(splineAndSpreadKernelSingle); + stat |= clReleaseKernel(splineKernelSingle); + stat |= clReleaseKernel(spreadKernelSingle); + stat |= clReleaseKernel(splineAndSpreadKernelDual); + stat |= clReleaseKernel(splineKernelDual); + stat |= clReleaseKernel(spreadKernelDual); + stat |= clReleaseKernel(gatherKernelSingle); + stat |= clReleaseKernel(gatherKernelDual); + stat |= clReleaseKernel(solveXYZKernelA); + stat |= clReleaseKernel(solveXYZEnergyKernelA); + stat |= clReleaseKernel(solveYZXKernelA); + stat |= clReleaseKernel(solveYZXEnergyKernelA); + stat |= clReleaseKernel(solveXYZKernelB); + stat |= clReleaseKernel(solveXYZEnergyKernelB); + stat |= clReleaseKernel(solveYZXKernelB); + stat |= clReleaseKernel(solveYZXEnergyKernelB); GMX_ASSERT(stat == CL_SUCCESS, gmx::formatString("Failed to release PME OpenCL resources %d: %s", stat, ocl_get_error_string(stat).c_str()) @@ -163,7 +171,7 @@ void PmeGpuProgramImpl::compileKernels(const DeviceInformation& deviceInfo) } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; - constexpr cl_uint expectedKernelCount = 9; + constexpr cl_uint expectedKernelCount = 18; // Has to be equal or larger than the number of kernel instances. // If it is not, CL_INVALID_VALUE will be thrown. std::vector kernels(expectedKernelCount, nullptr); @@ -193,42 +201,79 @@ void PmeGpuProgramImpl::compileKernels(const DeviceInformation& deviceInfo) // The names below must correspond to those defined in pme_program.cl // TODO use a map with string key instead? 
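One way to act on the TODO above would be a name-to-member lookup table instead of the growing strcmp chain. The sketch below is illustrative only and not part of this change: KernelHandle, PmeKernels, makeKernelTable and assignKernel are hypothetical stand-ins for cl_kernel and the PmeGpuProgramImpl members, and only a few of the kernel names defined in pme_program.cl are listed.

#include <string>
#include <unordered_map>
#include <vector>

using KernelHandle = void*; // stand-in for cl_kernel in this sketch

struct PmeKernels
{
    KernelHandle splineAndSpreadKernelSingle = nullptr;
    KernelHandle splineAndSpreadKernelDual   = nullptr;
    KernelHandle solveYZXKernelA             = nullptr;
    KernelHandle solveYZXKernelB             = nullptr;
    // ... remaining handles omitted for brevity
};

// Map each kernel name (as defined in pme_program.cl) to the member(s) it populates.
static std::unordered_map<std::string, std::vector<KernelHandle PmeKernels::*>> makeKernelTable()
{
    return {
        { "pmeSplineAndSpreadKernelSingle", { &PmeKernels::splineAndSpreadKernelSingle } },
        { "pmeSplineAndSpreadKernelDual", { &PmeKernels::splineAndSpreadKernelDual } },
        { "pmeSolveYZXKernelA", { &PmeKernels::solveYZXKernelA } },
        { "pmeSolveYZXKernelB", { &PmeKernels::solveYZXKernelB } },
    };
}

// Assign a freshly built kernel object to every member registered under its name.
static void assignKernel(PmeKernels* kernels, const std::string& name, KernelHandle kernel)
{
    static const auto table = makeKernelTable();
    const auto        found = table.find(name);
    if (found != table.end())
    {
        for (auto member : found->second)
        {
            kernels->*member = kernel;
        }
    }
}

With a table like this, adding further variants (such as the A/B solve kernels or the Single/Dual spread kernels introduced here) would only require new table entries rather than new branches.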
- if (!strcmp(kernelNamesBuffer.data(), "pmeSplineKernel")) + if (!strcmp(kernelNamesBuffer.data(), "pmeSplineKernelSingle")) { - splineKernel = kernel; + splineKernelSingle = kernel; } - else if (!strcmp(kernelNamesBuffer.data(), "pmeSplineAndSpreadKernel")) + else if (!strcmp(kernelNamesBuffer.data(), "pmeSplineAndSpreadKernelSingle")) { - splineAndSpreadKernel = kernel; - splineAndSpreadKernelWriteSplines = kernel; - checkRequiredWarpSize(splineAndSpreadKernel, kernelNamesBuffer.data(), deviceInfo); + splineAndSpreadKernelSingle = kernel; + splineAndSpreadKernelWriteSplinesSingle = kernel; + checkRequiredWarpSize(splineAndSpreadKernelSingle, kernelNamesBuffer.data(), deviceInfo); } - else if (!strcmp(kernelNamesBuffer.data(), "pmeSpreadKernel")) + else if (!strcmp(kernelNamesBuffer.data(), "pmeSpreadKernelSingle")) { - spreadKernel = kernel; - checkRequiredWarpSize(spreadKernel, kernelNamesBuffer.data(), deviceInfo); + spreadKernelSingle = kernel; + checkRequiredWarpSize(spreadKernelSingle, kernelNamesBuffer.data(), deviceInfo); } - else if (!strcmp(kernelNamesBuffer.data(), "pmeGatherKernel")) + else if (!strcmp(kernelNamesBuffer.data(), "pmeSplineKernelDual")) { - gatherKernel = kernel; - gatherKernelReadSplines = kernel; - checkRequiredWarpSize(gatherKernel, kernelNamesBuffer.data(), deviceInfo); + splineKernelDual = kernel; } - else if (!strcmp(kernelNamesBuffer.data(), "pmeSolveYZXKernel")) + else if (!strcmp(kernelNamesBuffer.data(), "pmeSplineAndSpreadKernelDual")) { - solveYZXKernel = kernel; + splineAndSpreadKernelDual = kernel; + splineAndSpreadKernelWriteSplinesDual = kernel; + checkRequiredWarpSize(splineAndSpreadKernelDual, kernelNamesBuffer.data(), deviceInfo); } - else if (!strcmp(kernelNamesBuffer.data(), "pmeSolveYZXEnergyKernel")) + else if (!strcmp(kernelNamesBuffer.data(), "pmeSpreadKernelDual")) { - solveYZXEnergyKernel = kernel; + spreadKernelDual = kernel; + checkRequiredWarpSize(spreadKernelDual, kernelNamesBuffer.data(), deviceInfo); } - else if (!strcmp(kernelNamesBuffer.data(), "pmeSolveXYZKernel")) + else if (!strcmp(kernelNamesBuffer.data(), "pmeGatherKernelSingle")) { - solveXYZKernel = kernel; + gatherKernelSingle = kernel; + gatherKernelReadSplinesSingle = kernel; + checkRequiredWarpSize(gatherKernelSingle, kernelNamesBuffer.data(), deviceInfo); } - else if (!strcmp(kernelNamesBuffer.data(), "pmeSolveXYZEnergyKernel")) + else if (!strcmp(kernelNamesBuffer.data(), "pmeGatherKernelDual")) { - solveXYZEnergyKernel = kernel; + gatherKernelDual = kernel; + gatherKernelReadSplinesDual = kernel; + checkRequiredWarpSize(gatherKernelDual, kernelNamesBuffer.data(), deviceInfo); + } + else if (!strcmp(kernelNamesBuffer.data(), "pmeSolveYZXKernelA")) + { + solveYZXKernelA = kernel; + } + else if (!strcmp(kernelNamesBuffer.data(), "pmeSolveYZXEnergyKernelA")) + { + solveYZXEnergyKernelA = kernel; + } + else if (!strcmp(kernelNamesBuffer.data(), "pmeSolveXYZKernelA")) + { + solveXYZKernelA = kernel; + } + else if (!strcmp(kernelNamesBuffer.data(), "pmeSolveXYZEnergyKernelA")) + { + solveXYZEnergyKernelA = kernel; + } + else if (!strcmp(kernelNamesBuffer.data(), "pmeSolveYZXKernelB")) + { + solveYZXKernelB = kernel; + } + else if (!strcmp(kernelNamesBuffer.data(), "pmeSolveYZXEnergyKernelB")) + { + solveYZXEnergyKernelB = kernel; + } + else if (!strcmp(kernelNamesBuffer.data(), "pmeSolveXYZKernelB")) + { + solveXYZKernelB = kernel; + } + else if (!strcmp(kernelNamesBuffer.data(), "pmeSolveXYZEnergyKernelB")) + { + solveXYZEnergyKernelB = kernel; } } 
clReleaseProgram(program); diff --git a/src/gromacs/ewald/pme_gpu_staging.h b/src/gromacs/ewald/pme_gpu_staging.h index 4a95c0cda8..7439027639 100644 --- a/src/gromacs/ewald/pme_gpu_staging.h +++ b/src/gromacs/ewald/pme_gpu_staging.h @@ -54,6 +54,11 @@ #include "gromacs/gpu_utils/hostallocator.h" #include "gromacs/math/vectypes.h" +#ifndef NUM_STATES +//! Number of FEP states. +# define NUM_STATES 2 +#endif + /*! \internal \brief * The PME GPU intermediate buffers structure, included in the main PME GPU structure by value. * Buffers are managed by the PME GPU module. @@ -64,9 +69,9 @@ struct PmeGpuStaging gmx::PaddedHostVector h_forces; /*! \brief Virial and energy intermediate host-side buffer. Size is PME_GPU_VIRIAL_AND_ENERGY_COUNT. */ - float* h_virialAndEnergy; + float* h_virialAndEnergy[NUM_STATES]; /*! \brief B-spline values intermediate host-side buffer. */ - float* h_splineModuli; + float* h_splineModuli[NUM_STATES]; /*! \brief Pointer to the host memory with B-spline values. Only used for host-side gather, or unit tests */ float* h_theta; diff --git a/src/gromacs/ewald/pme_gpu_types.h b/src/gromacs/ewald/pme_gpu_types.h index 97f8416499..4274f095d9 100644 --- a/src/gromacs/ewald/pme_gpu_types.h +++ b/src/gromacs/ewald/pme_gpu_types.h @@ -82,6 +82,11 @@ static_assert(sizeof(DeviceBuffer) == 8, # define HIDE_FROM_OPENCL_COMPILER(x) char8 #endif +#ifndef NUMFEPSTATES +//! Number of FEP states. +# define NUMFEPSTATES 2 +#endif + /* What follows is all the PME GPU function arguments, * sorted into several device-side structures depending on the update rate. * This is GPU agnostic (float3 replaced by float[3], etc.). @@ -99,7 +104,7 @@ struct PmeGpuConstParams float elFactor; /*! \brief Virial and energy GPU array. Size is c_virialAndEnergyCount (7) floats. * The element order is virxx, viryy, virzz, virxy, virxz, viryz, energy. */ - HIDE_FROM_OPENCL_COMPILER(DeviceBuffer) d_virialAndEnergy; + HIDE_FROM_OPENCL_COMPILER(DeviceBuffer) d_virialAndEnergy[NUMFEPSTATES]; }; /*! \internal \brief @@ -130,14 +135,14 @@ struct PmeGpuGridParams /* Grid arrays */ /*! \brief Real space grid. */ - HIDE_FROM_OPENCL_COMPILER(DeviceBuffer) d_realGrid; + HIDE_FROM_OPENCL_COMPILER(DeviceBuffer) d_realGrid[NUMFEPSTATES]; /*! \brief Complex grid - used in FFT/solve. If inplace cu/clFFT is used, then it is the same handle as realGrid. */ - HIDE_FROM_OPENCL_COMPILER(DeviceBuffer) d_fourierGrid; + HIDE_FROM_OPENCL_COMPILER(DeviceBuffer) d_fourierGrid[NUMFEPSTATES]; /*! \brief Grid spline values as in pme->bsp_mod * (laid out sequentially (XXX....XYYY......YZZZ.....Z)) */ - HIDE_FROM_OPENCL_COMPILER(DeviceBuffer) d_splineModuli; + HIDE_FROM_OPENCL_COMPILER(DeviceBuffer) d_splineModuli[NUMFEPSTATES]; /*! \brief Fractional shifts lookup table as in pme->fshx/fshy/fshz, laid out sequentially (XXX....XYYY......YZZZ.....Z) */ HIDE_FROM_OPENCL_COMPILER(DeviceBuffer) d_fractShiftsTable; /*! \brief Gridline indices lookup table @@ -158,10 +163,10 @@ struct PmeGpuAtomParams * but reallocation happens only at DD. */ HIDE_FROM_OPENCL_COMPILER(DeviceBuffer) d_coordinates; - /*! \brief Global GPU memory array handle with input atom charges. + /*! \brief Global GPU memory array handle with input atom charges in states A and B. * The charges only need to be reallocated and copied to the GPU at DD step. */ - HIDE_FROM_OPENCL_COMPILER(DeviceBuffer) d_coefficients; + HIDE_FROM_OPENCL_COMPILER(DeviceBuffer) d_coefficients[NUMFEPSTATES]; /*! \brief Global GPU memory array handle with input/output rvec atom forces. 
* The forces change and need to be copied from (and possibly to) the GPU for every PME * computation, but reallocation happens only at DD. @@ -196,6 +201,9 @@ struct PmeGpuDynamicParams float recipBox[DIM][DIM]; /*! \brief The unit cell volume for solving. */ float boxVolume; + + /*! \brief The current coefficient scaling value. */ + float scale; }; /*! \internal \brief diff --git a/src/gromacs/ewald/pme_gpu_types_host.h b/src/gromacs/ewald/pme_gpu_types_host.h index 9d7e2f78f9..f8e42f7fb6 100644 --- a/src/gromacs/ewald/pme_gpu_types_host.h +++ b/src/gromacs/ewald/pme_gpu_types_host.h @@ -100,7 +100,7 @@ struct DeviceInformation; */ struct PmeShared { - /*! \brief Grid count - currently always 1 on GPU */ + /*! \brief Grid count */ int ngrids; /*! \brief Grid dimensions - nkx, nky, nkz */ int nk[DIM]; diff --git a/src/gromacs/ewald/pme_gpu_types_host_impl.h b/src/gromacs/ewald/pme_gpu_types_host_impl.h index e134d2c0a7..f3deae2842 100644 --- a/src/gromacs/ewald/pme_gpu_types_host_impl.h +++ b/src/gromacs/ewald/pme_gpu_types_host_impl.h @@ -62,6 +62,11 @@ #include "pme_gpu_3dfft.h" +#ifndef NUMFEPSTATES +//! Number of FEP states. +# define NUMFEPSTATES 2 +#endif + class GpuParallel3dFft; /*! \internal \brief @@ -141,21 +146,21 @@ struct PmeGpuSpecific /*! \brief Both the kernelParams.atoms.theta and kernelParams.atoms.dtheta float element count (reserved) */ int splineDataSizeAlloc = 0; /*! \brief The kernelParams.atoms.coefficients float element count (actual) */ - int coefficientsSize = 0; + int coefficientsSize[NUMFEPSTATES] = { 0, 0 }; /*! \brief The kernelParams.atoms.coefficients float element count (reserved) */ - int coefficientsSizeAlloc = 0; + int coefficientsCapacity[NUMFEPSTATES] = { 0, 0 }; /*! \brief The kernelParams.grid.splineValuesArray float element count (actual) */ - int splineValuesSize = 0; + int splineValuesSize[NUMFEPSTATES] = { 0, 0 }; /*! \brief The kernelParams.grid.splineValuesArray float element count (reserved) */ - int splineValuesSizeAlloc = 0; + int splineValuesCapacity[NUMFEPSTATES] = { 0, 0 }; /*! \brief The kernelParams.grid.realGrid float element count (actual) */ - int realGridSize = 0; + int realGridSize[NUMFEPSTATES] = { 0, 0 }; /*! \brief The kernelParams.grid.realGrid float element count (reserved) */ - int realGridSizeAlloc = 0; + int realGridCapacity[NUMFEPSTATES] = { 0, 0 }; /*! \brief The kernelParams.grid.fourierGrid float (not float2!) element count (actual) */ - int complexGridSize = 0; + int complexGridSize[NUMFEPSTATES] = { 0, 0 }; /*! \brief The kernelParams.grid.fourierGrid float (not float2!) 
element count (reserved) */ - int complexGridSizeAlloc = 0; + int complexGridCapacity[NUMFEPSTATES] = { 0, 0 }; }; #endif diff --git a/src/gromacs/ewald/pme_only.cpp b/src/gromacs/ewald/pme_only.cpp index a6e4f23183..1a33cc8ed0 100644 --- a/src/gromacs/ewald/pme_only.cpp +++ b/src/gromacs/ewald/pme_only.cpp @@ -418,7 +418,7 @@ static int gmx_pme_recv_coeffs_coords(struct gmx_pme_t* pme, { if (atomSetChanged) { - gmx_pme_reinit_atoms(pme, nat, pme_pp->chargeA.data()); + gmx_pme_reinit_atoms(pme, nat, pme_pp->chargeA.data(), pme_pp->chargeB.data()); if (useGpuForPme) { stateGpu->reinit(nat, nat); @@ -707,7 +707,7 @@ int gmx_pmeonly(struct gmx_pme_t* pme, stepWork.computeVirial = computeEnergyAndVirial; stepWork.computeEnergy = computeEnergyAndVirial; stepWork.computeForces = true; - PmeOutput output; + PmeOutput output = { {}, false, 0, { { 0 } }, 0, 0, { { 0 } }, 0 }; if (useGpuForPme) { stepWork.haveDynamicBox = false; @@ -723,10 +723,10 @@ int gmx_pmeonly(struct gmx_pme_t* pme, // TODO: with pme on GPU the receive should make a list of synchronizers and pass it here #3157 auto xReadyOnDevice = nullptr; - pme_gpu_launch_spread(pme, xReadyOnDevice, wcycle); + pme_gpu_launch_spread(pme, xReadyOnDevice, wcycle, lambda_q); pme_gpu_launch_complex_transforms(pme, wcycle, stepWork); - pme_gpu_launch_gather(pme, wcycle); - output = pme_gpu_wait_finish_task(pme, computeEnergyAndVirial, wcycle); + pme_gpu_launch_gather(pme, wcycle, lambda_q); + output = pme_gpu_wait_finish_task(pme, computeEnergyAndVirial, lambda_q, wcycle); pme_gpu_reinit_computation(pme, wcycle); } else diff --git a/src/gromacs/ewald/pme_output.h b/src/gromacs/ewald/pme_output.h index 6567341a1e..bc8a501fce 100644 --- a/src/gromacs/ewald/pme_output.h +++ b/src/gromacs/ewald/pme_output.h @@ -61,10 +61,14 @@ struct PmeOutput real coulombEnergy_ = 0; //!< Host staging area for PME coulomb virial contributions matrix coulombVirial_ = { { 0 } }; + //!< Host staging area for PME coulomb dVdl. + real coulombDvdl_ = 0; //!< Host staging area for PME LJ energy real lennardJonesEnergy_ = 0; //!< Host staging area for PME LJ virial contributions matrix lennardJonesVirial_ = { { 0 } }; + //!< Host staging area for PME LJ dVdl. (Not used) + real lennardJonesDvdl_ = 0; }; #endif diff --git a/src/gromacs/ewald/pme_program.cl b/src/gromacs/ewald/pme_program.cl index 6b6aab6d2d..354aa12e58 100644 --- a/src/gromacs/ewald/pme_program.cl +++ b/src/gromacs/ewald/pme_program.cl @@ -59,31 +59,70 @@ #define atomsPerBlock (c_spreadWorkGroupSize / threadsPerAtom) #define atomsPerWarp (warp_size / threadsPerAtom) -// spline/spread fused +// spline/spread fused single grid (without FEP or FEP without energy and virial calculation at this step) #define computeSplines 1 #define spreadCharges 1 -#define CUSTOMIZED_KERNEL_NAME(x) pmeSplineAndSpreadKernel +#define numGrids 1 /* Using define to avoid a conditional inside the function. */ +#define CUSTOMIZED_KERNEL_NAME(x) pmeSplineAndSpreadKernelSingle #include "pme_spread.clh" #undef computeSplines #undef spreadCharges +#undef numGrids #undef CUSTOMIZED_KERNEL_NAME -// spline +// spline on single grid (without FEP or FEP without energy and virial calculation at this step) #define computeSplines 1 #define spreadCharges 0 -#define CUSTOMIZED_KERNEL_NAME(x) pmeSplineKernel +#define numGrids 1 /* Using define to avoid a conditional inside the function. 
*/ +#define CUSTOMIZED_KERNEL_NAME(x) pmeSplineKernelSingle #include "pme_spread.clh" #undef computeSplines #undef spreadCharges +#undef numGrids #undef CUSTOMIZED_KERNEL_NAME -// spread +// spread on single grid (without FEP or FEP without energy and virial calculation at this step) #define computeSplines 0 #define spreadCharges 1 -#define CUSTOMIZED_KERNEL_NAME(x) pmeSpreadKernel +#define numGrids 1 /* Using define to avoid a conditional inside the function. */ +#define CUSTOMIZED_KERNEL_NAME(x) pmeSpreadKernelSingle #include "pme_spread.clh" #undef computeSplines #undef spreadCharges +#undef numGrids +#undef CUSTOMIZED_KERNEL_NAME + +// spline/spread fused on two grids (FEP with energy and virial calculation at this step) +#define computeSplines 1 +#define spreadCharges 1 +#define numGrids 2 /* Using define to avoid a conditional inside the function. */ +#define CUSTOMIZED_KERNEL_NAME(x) pmeSplineAndSpreadKernelDual +#include "pme_spread.clh" +#undef computeSplines +#undef spreadCharges +#undef numGrids +#undef CUSTOMIZED_KERNEL_NAME + +// spline on two grids (FEP with energy and virial calculation at this step) +#define computeSplines 1 +#define spreadCharges 0 +#define numGrids 2 /* Using define to avoid a conditional inside the function. */ +#define CUSTOMIZED_KERNEL_NAME(x) pmeSplineKernelDual +#include "pme_spread.clh" +#undef computeSplines +#undef spreadCharges +#undef numGrids +#undef CUSTOMIZED_KERNEL_NAME + +// spread on two grids (FEP with energy and virial calculation at this step) +#define computeSplines 0 +#define spreadCharges 1 +#define numGrids 2 /* Using define to avoid a conditional inside the function. */ +#define CUSTOMIZED_KERNEL_NAME(x) pmeSpreadKernelDual +#include "pme_spread.clh" +#undef computeSplines +#undef spreadCharges +#undef numGrids #undef CUSTOMIZED_KERNEL_NAME @@ -94,11 +133,19 @@ #define atomsPerBlock (c_gatherWorkGroupSize / threadsPerAtom) -// gather -#define CUSTOMIZED_KERNEL_NAME(x) pmeGatherKernel +// gather using single grid +#define numGrids 1 /* Using define to avoid a conditional inside the function. */ +#define CUSTOMIZED_KERNEL_NAME(x) pmeGatherKernelSingle #include "pme_gather.clh" +#undef numGrids #undef CUSTOMIZED_KERNEL_NAME +// gather using two grids +#define numGrids 2 /* Using define to avoid a conditional inside the function. 
*/ +#define CUSTOMIZED_KERNEL_NAME(x) pmeGatherKernelDual +#include "pme_gather.clh" +#undef numGrids +#undef CUSTOMIZED_KERNEL_NAME #undef atomsPerBlock @@ -108,37 +155,73 @@ #define YZX 1 #define XYZ 2 -// solve, YZX dimension order +// solve, YZX dimension order state A +#define gridOrdering YZX +#define computeEnergyAndVirial 0 +#define CUSTOMIZED_KERNEL_NAME(x) pmeSolveYZXKernelA +#include "pme_solve.clh" +#undef gridOrdering +#undef computeEnergyAndVirial +#undef CUSTOMIZED_KERNEL_NAME + +// solve with reduction, YZX dimension order state A +#define gridOrdering YZX +#define computeEnergyAndVirial 1 +#define CUSTOMIZED_KERNEL_NAME(x) pmeSolveYZXEnergyKernelA +#include "pme_solve.clh" +#undef gridOrdering +#undef computeEnergyAndVirial +#undef CUSTOMIZED_KERNEL_NAME + +// solve, XYZ dimension order state A +#define gridOrdering XYZ +#define computeEnergyAndVirial 0 +#define CUSTOMIZED_KERNEL_NAME(x) pmeSolveXYZKernelA +#include "pme_solve.clh" +#undef gridOrdering +#undef computeEnergyAndVirial +#undef CUSTOMIZED_KERNEL_NAME + +// solve with reduction, XYZ dimension order state A +#define gridOrdering XYZ +#define computeEnergyAndVirial 1 +#define CUSTOMIZED_KERNEL_NAME(x) pmeSolveXYZEnergyKernelA +#include "pme_solve.clh" +#undef gridOrdering +#undef computeEnergyAndVirial +#undef CUSTOMIZED_KERNEL_NAME + +// solve, YZX dimension order state B #define gridOrdering YZX #define computeEnergyAndVirial 0 -#define CUSTOMIZED_KERNEL_NAME(x) pmeSolveYZXKernel +#define CUSTOMIZED_KERNEL_NAME(x) pmeSolveYZXKernelB #include "pme_solve.clh" #undef gridOrdering #undef computeEnergyAndVirial #undef CUSTOMIZED_KERNEL_NAME -// solve with reduction, YZX dimension order +// solve with reduction, YZX dimension order state B #define gridOrdering YZX #define computeEnergyAndVirial 1 -#define CUSTOMIZED_KERNEL_NAME(x) pmeSolveYZXEnergyKernel +#define CUSTOMIZED_KERNEL_NAME(x) pmeSolveYZXEnergyKernelB #include "pme_solve.clh" #undef gridOrdering #undef computeEnergyAndVirial #undef CUSTOMIZED_KERNEL_NAME -// solve, XYZ dimension order +// solve, XYZ dimension order state B #define gridOrdering XYZ #define computeEnergyAndVirial 0 -#define CUSTOMIZED_KERNEL_NAME(x) pmeSolveXYZKernel +#define CUSTOMIZED_KERNEL_NAME(x) pmeSolveXYZKernelB #include "pme_solve.clh" #undef gridOrdering #undef computeEnergyAndVirial #undef CUSTOMIZED_KERNEL_NAME -// solve with reduction, XYZ dimension order +// solve with reduction, XYZ dimension order state B #define gridOrdering XYZ #define computeEnergyAndVirial 1 -#define CUSTOMIZED_KERNEL_NAME(x) pmeSolveXYZEnergyKernel +#define CUSTOMIZED_KERNEL_NAME(x) pmeSolveXYZEnergyKernelB #include "pme_solve.clh" #undef gridOrdering #undef computeEnergyAndVirial diff --git a/src/gromacs/ewald/pme_solve.clh b/src/gromacs/ewald/pme_solve.clh index 2c4dbde609..0c21512b6f 100644 --- a/src/gromacs/ewald/pme_solve.clh +++ b/src/gromacs/ewald/pme_solve.clh @@ -60,7 +60,8 @@ * \param[in] kernelParams Input PME GPU data in constant memory. * \param[in] gm_splineModuli B-Spline moduli. * \param[out] gm_virialAndEnergy Reduced virial and enrgy (only with computeEnergyAndVirial == - * true) \param[in,out] gm_grid Fourier grid to transform. + * true) + * \param[in,out] gm_grid Fourier grid to transform. 
*/ __attribute__((work_group_size_hint(c_solveMaxWorkGroupSize, 1, 1))) __kernel void CUSTOMIZED_KERNEL_NAME(pme_solve_kernel)(const struct PmeOpenCLKernelParams kernelParams, @@ -137,8 +138,9 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_solve_kernel)(const struct PmeOpenCLKer & (gridLineIndex < gridLinesPerBlock)) { /* The offset should be equal to the global thread index for coalesced access */ - const int gridIndex = (indexMajor * localSizeMiddle + indexMiddle) * localSizeMinor + indexMinor; - __global float2* __restrict__ gm_gridCell = gm_grid + gridIndex; + const int gridThreadIndex = + (indexMajor * localSizeMiddle + indexMiddle) * localSizeMinor + indexMinor; + __global float2* __restrict__ gm_gridCell = gm_grid + gridThreadIndex; const int kMajor = indexMajor + localOffsetMajor; /* Checking either X in XYZ, or Y in YZX cases */ diff --git a/src/gromacs/ewald/pme_solve.cu b/src/gromacs/ewald/pme_solve.cu index 784de81f96..3f5d2d06f4 100644 --- a/src/gromacs/ewald/pme_solve.cu +++ b/src/gromacs/ewald/pme_solve.cu @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2016,2017,2018,2019, by the GROMACS development team, led by + * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -53,10 +53,11 @@ * PME complex grid solver kernel function. * * \tparam[in] gridOrdering Specifies the dimension ordering of the complex grid. - * \tparam[in] computeEnergyAndVirial Tells if the reciprocal energy and virial should be - * computed. \param[in] kernelParams Input PME CUDA data in constant memory. + * \tparam[in] computeEnergyAndVirial Tells if the reciprocal energy and virial should be computed. + * \tparam[in] gridIndex The index of the grid to use in the kernel. + * \param[in] kernelParams Input PME CUDA data in constant memory. 
*/ -template +template __launch_bounds__(c_solveMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBUTE __global__ void pme_solve_kernel(const struct PmeGpuCudaKernelParams kernelParams) { @@ -80,14 +81,14 @@ __launch_bounds__(c_solveMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBUT } /* Global memory pointers */ - const float* __restrict__ gm_splineValueMajor = - kernelParams.grid.d_splineModuli + kernelParams.grid.splineValuesOffset[majorDim]; - const float* __restrict__ gm_splineValueMiddle = - kernelParams.grid.d_splineModuli + kernelParams.grid.splineValuesOffset[middleDim]; - const float* __restrict__ gm_splineValueMinor = - kernelParams.grid.d_splineModuli + kernelParams.grid.splineValuesOffset[minorDim]; - float* __restrict__ gm_virialAndEnergy = kernelParams.constants.d_virialAndEnergy; - float2* __restrict__ gm_grid = (float2*)kernelParams.grid.d_fourierGrid; + const float* __restrict__ gm_splineValueMajor = kernelParams.grid.d_splineModuli[gridIndex] + + kernelParams.grid.splineValuesOffset[majorDim]; + const float* __restrict__ gm_splineValueMiddle = kernelParams.grid.d_splineModuli[gridIndex] + + kernelParams.grid.splineValuesOffset[middleDim]; + const float* __restrict__ gm_splineValueMinor = kernelParams.grid.d_splineModuli[gridIndex] + + kernelParams.grid.splineValuesOffset[minorDim]; + float* __restrict__ gm_virialAndEnergy = kernelParams.constants.d_virialAndEnergy[gridIndex]; + float2* __restrict__ gm_grid = (float2*)kernelParams.grid.d_fourierGrid[gridIndex]; /* Various grid sizes and indices */ const int localOffsetMinor = 0, localOffsetMajor = 0, localOffsetMiddle = 0; // unused @@ -131,8 +132,9 @@ __launch_bounds__(c_solveMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBUT & (gridLineIndex < gridLinesPerBlock)) { /* The offset should be equal to the global thread index for coalesced access */ - const int gridIndex = (indexMajor * localSizeMiddle + indexMiddle) * localSizeMinor + indexMinor; - float2* __restrict__ gm_gridCell = gm_grid + gridIndex; + const int gridThreadIndex = + (indexMajor * localSizeMiddle + indexMiddle) * localSizeMinor + indexMinor; + float2* __restrict__ gm_gridCell = gm_grid + gridThreadIndex; const int kMajor = indexMajor + localOffsetMajor; /* Checking either X in XYZ, or Y in YZX cases */ @@ -352,7 +354,11 @@ __launch_bounds__(c_solveMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBUT } //! 
Kernel instantiations -template __global__ void pme_solve_kernel(const PmeGpuCudaKernelParams); -template __global__ void pme_solve_kernel(const PmeGpuCudaKernelParams); -template __global__ void pme_solve_kernel(const PmeGpuCudaKernelParams); -template __global__ void pme_solve_kernel(const PmeGpuCudaKernelParams); +template __global__ void pme_solve_kernel(const PmeGpuCudaKernelParams); +template __global__ void pme_solve_kernel(const PmeGpuCudaKernelParams); +template __global__ void pme_solve_kernel(const PmeGpuCudaKernelParams); +template __global__ void pme_solve_kernel(const PmeGpuCudaKernelParams); +template __global__ void pme_solve_kernel(const PmeGpuCudaKernelParams); +template __global__ void pme_solve_kernel(const PmeGpuCudaKernelParams); +template __global__ void pme_solve_kernel(const PmeGpuCudaKernelParams); +template __global__ void pme_solve_kernel(const PmeGpuCudaKernelParams); diff --git a/src/gromacs/ewald/pme_spread.clh b/src/gromacs/ewald/pme_spread.clh index 91e36c2518..3342120d51 100644 --- a/src/gromacs/ewald/pme_spread.clh +++ b/src/gromacs/ewald/pme_spread.clh @@ -75,7 +75,8 @@ * * \param[out] sm_destination Local memory array for output. * \param[in] gm_source Global memory array for input. - * \param[in] dataCountPerAtom Number of data elements per single atom (e.g. DIM for an rvec coordinates array). + * \param[in] dataCountPerAtom Number of data elements per single + * atom (e.g. DIM for an rvec coordinates array). * */ inline void pme_gpu_stage_atom_data(__local float* __restrict__ sm_destination, @@ -113,7 +114,7 @@ inline void pme_gpu_stage_atom_data(__local float* __restrict__ sm_destination, * \param[out] gm_dtheta Atom spline parameter derivatives * \param[out] gm_gridlineIndices Same as \p sm_gridlineIndices but in global memory. * \param[in] gm_fractShiftsTable Atom fractional coordinates correction table - * \param[in] gm_gridlineIndicesTable Atom fractional coordinates correction table + * \param[in] gm_gridlineIndicesTable Table with atom gridline indices */ gmx_opencl_inline void calculate_splines(const struct PmeOpenCLKernelParams kernelParams, const int atomIndexOffset, @@ -404,10 +405,17 @@ gmx_opencl_inline void spread_charges(const struct PmeOpenCLKernelParams kernelP * \param[in,out] gm_theta Atom spline parameter values * \param[out] gm_dtheta Atom spline parameter derivatives * \param[in,out] gm_gridlineIndices Atom gridline indices (ivec) - * \param[out] gm_grid Global 3D grid for charge spreading. + * \param[out] gm_gridA Global 3D grid for charge spreading. + * Grid for the unperturbed state, FEP state A or the single grid used for + * interpolated coefficients on one grid in FEP A/B. + * \param[out] gm_gridB Global 3D grid for charge spreading. + * FEP state B when using dual grids (when calculating energy and virials). * \param[in] gm_fractShiftsTable Atom fractional coordinates correction table - * \param[in] gm_gridlineIndicesTable Atom fractional coordinates correction table - * \param[in] gm_coefficients Atom charges/coefficients. + * \param[in] gm_gridlineIndicesTable Table with atom gridline indices + * \param[in] gm_coefficientsA Atom charges/coefficients of the unperturbed state + * or FEP state A. + * \param[in] gm_coefficientsB Atom charges/coefficients in FEP state B. Only + * used when spreading interpolated coefficients on one grid. 
* \param[in] gm_coordinates Atom coordinates (rvec) */ __attribute__((reqd_work_group_size(order, order, atomsPerBlock))) __kernel void CUSTOMIZED_KERNEL_NAME( @@ -415,10 +423,12 @@ __attribute__((reqd_work_group_size(order, order, atomsPerBlock))) __kernel void __global float* __restrict__ gm_theta, __global float* __restrict__ gm_dtheta, __global int* __restrict__ gm_gridlineIndices, - __global float* __restrict__ gm_grid, + __global float* __restrict__ gm_gridA, + __global float* __restrict__ gm_gridB, __global const float* __restrict__ gm_fractShiftsTable, __global const int* __restrict__ gm_gridlineIndicesTable, - __global const float* __restrict__ gm_coefficients, + __global const float* __restrict__ gm_coefficientsA, + __global const float* __restrict__ gm_coefficientsB, __global const float* __restrict__ gm_coordinates) { // Gridline indices, ivec @@ -433,9 +443,10 @@ __attribute__((reqd_work_group_size(order, order, atomsPerBlock))) __kernel void __local float sm_coordinates[DIM * atomsPerBlock]; const int atomIndexOffset = (int)get_group_id(XX) * atomsPerBlock; + int gridIndex = 0; /* Staging coefficients/charges for both spline and spread */ - pme_gpu_stage_atom_data(sm_coefficients, gm_coefficients, 1); + pme_gpu_stage_atom_data(sm_coefficients, gm_coefficientsA, 1); if (computeSplines) { @@ -470,6 +481,17 @@ __attribute__((reqd_work_group_size(order, order, atomsPerBlock))) __kernel void /* Spreading */ if (spreadCharges) { - spread_charges(kernelParams, sm_coefficients, sm_gridlineIndices, sm_theta, gm_grid); + spread_charges(kernelParams, sm_coefficients, sm_gridlineIndices, sm_theta, gm_gridA); + } + if (numGrids == 2) + { + barrier(CLK_LOCAL_MEM_FENCE); + gridIndex = 1; + pme_gpu_stage_atom_data(sm_coefficients, gm_coefficientsB, 1); + barrier(CLK_LOCAL_MEM_FENCE); + if (spreadCharges) + { + spread_charges(kernelParams, sm_coefficients, sm_gridlineIndices, sm_theta, gm_gridB); + } } } diff --git a/src/gromacs/ewald/pme_spread.cu b/src/gromacs/ewald/pme_spread.cu index 806dddd4f8..3cf3a1f7ca 100644 --- a/src/gromacs/ewald/pme_spread.cu +++ b/src/gromacs/ewald/pme_spread.cu @@ -53,6 +53,17 @@ #include "pme_gpu_calculate_splines.cuh" #include "pme_grid.h" +/* + * This define affects the spline calculation behaviour in the kernel. + * 0: a single GPU thread handles a single dimension of a single particle (calculating and storing + * (order) spline values and derivatives). 1: (order) threads do redundant work on this same task, + * each one stores only a single theta and single dtheta into global arrays. The only efficiency + * difference is less global store operations, countered by more redundant spline computation. + * + * TODO: estimate if this should be a boolean parameter (and add it to the unit test if so). + */ +#define PME_GPU_PARALLEL_SPLINE 0 + /*! \brief * Charge spreading onto the grid. * This corresponds to the CPU function spread_coefficients_bsplines_thread(). @@ -61,6 +72,7 @@ * \tparam[in] order PME interpolation order. * \tparam[in] wrapX Whether the grid overlap in dimension X should be wrapped. * \tparam[in] wrapY Whether the grid overlap in dimension Y should be wrapped. + * \tparam[in] gridIndex The index of the grid to use in the kernel. * \tparam[in] threadsPerAtom How many threads work on each atom * * \param[in] kernelParams Input PME CUDA data in constant memory. @@ -68,14 +80,14 @@ * \param[in] sm_gridlineIndices Atom gridline indices in the shared memory. * \param[in] sm_theta Atom spline values in the shared memory. 
*/ -template +template __device__ __forceinline__ void spread_charges(const PmeGpuCudaKernelParams kernelParams, const float* atomCharge, const int* __restrict__ sm_gridlineIndices, const float* __restrict__ sm_theta) { /* Global memory pointer to the output grid */ - float* __restrict__ gm_grid = kernelParams.grid.d_realGrid; + float* __restrict__ gm_grid = kernelParams.grid.d_realGrid[gridIndex]; // Number of atoms processed by a single warp in spread and gather const int threadsPerAtomValue = (threadsPerAtom == ThreadsPerAtom::Order) ? order : order * order; @@ -163,11 +175,12 @@ __device__ __forceinline__ void spread_charges(const PmeGpuCudaKernelParams kern * \tparam[in] spreadCharges A boolean which tells if the charge spreading should be performed. * \tparam[in] wrapX A boolean which tells if the grid overlap in dimension X should be wrapped. * \tparam[in] wrapY A boolean which tells if the grid overlap in dimension Y should be wrapped. + * \tparam[in] numGrids The number of grids to use in the kernel. Can be 1 or 2. * \tparam[in] writeGlobal A boolean which tells if the theta values and gridlines should be written to global memory. * \tparam[in] threadsPerAtom How many threads work on each atom * \param[in] kernelParams Input PME CUDA data in constant memory. */ -template +template __launch_bounds__(c_spreadMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBUTE __global__ void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams kernelParams) { @@ -177,6 +190,8 @@ __launch_bounds__(c_spreadMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBU const int atomsPerWarp = warp_size / threadsPerAtomValue; // Gridline indices, ivec __shared__ int sm_gridlineIndices[atomsPerBlock * DIM]; + // Charges + __shared__ float sm_coefficients[atomsPerBlock]; // Spline values __shared__ float sm_theta[atomsPerBlock * DIM * order]; float dtheta; @@ -210,14 +225,14 @@ __launch_bounds__(c_spreadMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBU /* Charges, required for both spline and spread */ if (c_useAtomDataPrefetch) { - __shared__ float sm_coefficients[atomsPerBlock]; - pme_gpu_stage_atom_data(sm_coefficients, kernelParams.atoms.d_coefficients); + pme_gpu_stage_atom_data(sm_coefficients, + kernelParams.atoms.d_coefficients[0]); __syncthreads(); atomCharge = sm_coefficients[atomIndexLocal]; } else { - atomCharge = kernelParams.atoms.d_coefficients[atomIndexGlobal]; + atomCharge = kernelParams.atoms.d_coefficients[0][atomIndexGlobal]; } if (computeSplines) @@ -259,19 +274,47 @@ __launch_bounds__(c_spreadMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBU /* Spreading */ if (spreadCharges) { - spread_charges(kernelParams, &atomCharge, - sm_gridlineIndices, sm_theta); + spread_charges(kernelParams, &atomCharge, + sm_gridlineIndices, sm_theta); + } + if (numGrids == 2) + { + __syncthreads(); + if (c_useAtomDataPrefetch) + { + pme_gpu_stage_atom_data(sm_coefficients, + kernelParams.atoms.d_coefficients[1]); + __syncthreads(); + atomCharge = sm_coefficients[atomIndexLocal]; + } + else + { + atomCharge = kernelParams.atoms.d_coefficients[1][atomIndexGlobal]; + } + if (spreadCharges) + { + spread_charges(kernelParams, &atomCharge, + sm_gridlineIndices, sm_theta); + } } } //! 
Kernel instantiations // clang-format off -template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, true, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams); -template __global__ void pme_spline_and_spread_kernel<4, true, false, true, true, true, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams); -template __global__ void pme_spline_and_spread_kernel<4, false, true, true, true, true, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams); -template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, false, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams); -template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams); -template __global__ void pme_spline_and_spread_kernel<4, true, false, true, true, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams); -template __global__ void pme_spline_and_spread_kernel<4, false, true, true, true, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams); -template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, false, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams); +template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, 1, true, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams); +template __global__ void pme_spline_and_spread_kernel<4, true, false, true, true, 1, true, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams); +template __global__ void pme_spline_and_spread_kernel<4, false, true, true, true, 1, true, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams); +template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, 1, false, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams); +template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, 1, true, ThreadsPerAtom::OrderSquared> (const PmeGpuCudaKernelParams); +template __global__ void pme_spline_and_spread_kernel<4, true, false, true, true, 1, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams); +template __global__ void pme_spline_and_spread_kernel<4, false, true, true, true, 1, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams); +template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, 1, false, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams); +template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, 2, true, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams); +template __global__ void pme_spline_and_spread_kernel<4, true, false, true, true, 2, true, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams); +template __global__ void pme_spline_and_spread_kernel<4, false, true, true, true, 2, true, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams); +template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, 2, false, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams); +template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, 2, true, ThreadsPerAtom::OrderSquared> (const PmeGpuCudaKernelParams); +template __global__ void pme_spline_and_spread_kernel<4, true, false, true, true, 2, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams); +template __global__ void pme_spline_and_spread_kernel<4, false, true, true, true, 2, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams); +template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, 2, false, 
ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams); // clang-format on diff --git a/src/gromacs/ewald/tests/pmetestcommon.cpp b/src/gromacs/ewald/tests/pmetestcommon.cpp index 888cac5873..2e28fb4aa2 100644 --- a/src/gromacs/ewald/tests/pmetestcommon.cpp +++ b/src/gromacs/ewald/tests/pmetestcommon.cpp @@ -81,15 +81,14 @@ namespace test bool pmeSupportsInputForMode(const gmx_hw_info_t& hwinfo, const t_inputrec* inputRec, CodePath mode) { - bool implemented; - gmx_mtop_t mtop; + bool implemented; switch (mode) { case CodePath::CPU: implemented = true; break; case CodePath::GPU: implemented = (pme_gpu_supports_build(nullptr) && pme_gpu_supports_hardware(hwinfo, nullptr) - && pme_gpu_supports_input(*inputRec, mtop, nullptr)); + && pme_gpu_supports_input(*inputRec, nullptr)); break; default: GMX_THROW(InternalError("Test not implemented for this mode")); @@ -202,7 +201,7 @@ void pmeInitAtoms(gmx_pme_t* pme, atc = &(pme->atc[0]); atc->x = coordinates; atc->coefficient = charges; - gmx_pme_reinit_atoms(pme, atomCount, charges.data()); + gmx_pme_reinit_atoms(pme, atomCount, charges.data(), nullptr); /* With decomposition there would be more boilerplate atc code here, e.g. do_redist_pos_coeffs */ break; @@ -211,7 +210,7 @@ void pmeInitAtoms(gmx_pme_t* pme, atc = &(pme->atc[0]); // We need to set atc->n for passing the size in the tests atc->setNumAtoms(atomCount); - gmx_pme_reinit_atoms(pme, atomCount, charges.data()); + gmx_pme_reinit_atoms(pme, atomCount, charges.data(), nullptr); stateGpu->reinit(atomCount, atomCount); stateGpu->copyCoordinatesToGpu(arrayRefFromArray(coordinates.data(), coordinates.size()), @@ -314,28 +313,35 @@ void pmePerformSplineAndSpread(gmx_pme_t* pme, PmeAtomComm* atc = &(pme->atc[0]); const size_t gridIndex = 0; const bool computeSplinesForZeroCharges = true; - real* fftgrid = spreadCharges ? pme->fftgrid[gridIndex] : nullptr; + real** fftgrid = spreadCharges ? pme->fftgrid : nullptr; real* pmegrid = pme->pmegrid[gridIndex].grid.grid; switch (mode) { case CodePath::CPU: spread_on_grid(pme, atc, &pme->pmegrid[gridIndex], computeSplines, spreadCharges, - fftgrid, computeSplinesForZeroCharges, gridIndex); + fftgrid != nullptr ? fftgrid[gridIndex] : nullptr, + computeSplinesForZeroCharges, gridIndex); if (spreadCharges && !pme->bUseThreads) { wrap_periodic_pmegrid(pme, pmegrid); - copy_pmegrid_to_fftgrid(pme, pmegrid, fftgrid, gridIndex); + copy_pmegrid_to_fftgrid( + pme, pmegrid, fftgrid != nullptr ? fftgrid[gridIndex] : nullptr, gridIndex); } break; +/* The compiler will complain about passing fftgrid (converting double ** to float **) if using + * double precision. GPUs are not used with double precision anyhow. 
*/ +#if !GMX_DOUBLE case CodePath::GPU: { + const real lambdaQ = 1.0; // no synchronization needed as x is transferred in the PME stream GpuEventSynchronizer* xReadyOnDevice = nullptr; - pme_gpu_spread(pme->gpu, xReadyOnDevice, gridIndex, fftgrid, computeSplines, spreadCharges); + pme_gpu_spread(pme->gpu, xReadyOnDevice, fftgrid, computeSplines, spreadCharges, lambdaQ); } break; +#endif default: GMX_THROW(InternalError("Test not implemented for this mode")); } @@ -374,6 +380,7 @@ void pmePerformSolve(const gmx_pme_t* pme, t_complex* h_grid = pmeGetComplexGridInternal(pme); const bool useLorentzBerthelot = false; const size_t threadIndex = 0; + const size_t gridIndex = 0; switch (mode) { case CodePath::CPU: @@ -400,7 +407,7 @@ void pmePerformSolve(const gmx_pme_t* pme, switch (method) { case PmeSolveAlgorithm::Coulomb: - pme_gpu_solve(pme->gpu, h_grid, gridOrdering, computeEnergyAndVirial); + pme_gpu_solve(pme->gpu, gridIndex, h_grid, gridOrdering, computeEnergyAndVirial); break; default: GMX_THROW(InternalError("Test not implemented for this mode")); @@ -421,7 +428,7 @@ void pmePerformGather(gmx_pme_t* pme, CodePath mode, ForcesVector& forces) const size_t threadIndex = 0; const size_t gridIndex = 0; real* pmegrid = pme->pmegrid[gridIndex].grid.grid; - real* fftgrid = pme->fftgrid[gridIndex]; + real** fftgrid = pme->fftgrid; switch (mode) { @@ -432,22 +439,27 @@ void pmePerformGather(gmx_pme_t* pme, CodePath mode, ForcesVector& forces) // something which is normally done in serial spline computation (make_thread_local_ind()) atc->spline[threadIndex].n = atomCount; } - copy_fftgrid_to_pmegrid(pme, fftgrid, pmegrid, gridIndex, pme->nthread, threadIndex); + copy_fftgrid_to_pmegrid(pme, fftgrid[gridIndex], pmegrid, gridIndex, pme->nthread, threadIndex); unwrap_periodic_pmegrid(pme, pmegrid); gather_f_bsplines(pme, pmegrid, true, atc, &atc->spline[threadIndex], scale); break; +/* The compiler will complain about passing fftgrid (converting double ** to float **) if using + * double precision. GPUs are not used with double precision anyhow. */ +#if !GMX_DOUBLE case CodePath::GPU: { // Variable initialization needs a non-switch scope const bool computeEnergyAndVirial = false; - PmeOutput output = pme_gpu_getOutput(*pme, computeEnergyAndVirial); + const real lambdaQ = 1.0; + PmeOutput output = pme_gpu_getOutput(*pme, computeEnergyAndVirial, lambdaQ); GMX_ASSERT(forces.size() == output.forces_.size(), "Size of force buffers did not match"); - pme_gpu_gather(pme->gpu, reinterpret_cast(fftgrid)); + pme_gpu_gather(pme->gpu, fftgrid, lambdaQ); std::copy(std::begin(output.forces_), std::end(output.forces_), std::begin(forces)); } break; +#endif default: GMX_THROW(InternalError("Test not implemented for this mode")); } @@ -843,7 +855,8 @@ SparseComplexGridValuesOutput pmeGetComplexGrid(const gmx_pme_t* pme, CodePath m //! 
Getting the reciprocal energy and virial PmeOutput pmeGetReciprocalEnergyAndVirial(const gmx_pme_t* pme, CodePath mode, PmeSolveAlgorithm method) { - PmeOutput output; + PmeOutput output; + const real lambdaQ = 1.0; switch (mode) { case CodePath::CPU: @@ -863,7 +876,9 @@ PmeOutput pmeGetReciprocalEnergyAndVirial(const gmx_pme_t* pme, CodePath mode, P case CodePath::GPU: switch (method) { - case PmeSolveAlgorithm::Coulomb: pme_gpu_getEnergyAndVirial(*pme, &output); break; + case PmeSolveAlgorithm::Coulomb: + pme_gpu_getEnergyAndVirial(*pme, lambdaQ, &output); + break; default: GMX_THROW(InternalError("Test not implemented for this mode")); } diff --git a/src/gromacs/mdlib/mdatoms.cpp b/src/gromacs/mdlib/mdatoms.cpp index f40b2d24af..0d1abd8565 100644 --- a/src/gromacs/mdlib/mdatoms.cpp +++ b/src/gromacs/mdlib/mdatoms.cpp @@ -75,8 +75,9 @@ MDAtoms::~MDAtoms() gmx::AlignedAllocationPolicy::free(mdatoms_->invmass); sfree(mdatoms_->invMassPerDim); sfree(mdatoms_->typeA); - sfree(mdatoms_->chargeB); sfree(mdatoms_->typeB); + /* mdatoms->chargeA and mdatoms->chargeB point at chargeA_.data() + * and chargeB_.data() respectively. They get freed automatically. */ sfree(mdatoms_->sqrt_c6A); sfree(mdatoms_->sigmaA); sfree(mdatoms_->sigma3A); @@ -95,18 +96,30 @@ MDAtoms::~MDAtoms() sfree(mdatoms_->cU2); } -void MDAtoms::resize(int newSize) +void MDAtoms::resizeChargeA(const int newSize) { chargeA_.resizeWithPadding(newSize); mdatoms_->chargeA = chargeA_.data(); } -void MDAtoms::reserve(int newCapacity) +void MDAtoms::resizeChargeB(const int newSize) +{ + chargeB_.resizeWithPadding(newSize); + mdatoms_->chargeB = chargeB_.data(); +} + +void MDAtoms::reserveChargeA(const int newCapacity) { chargeA_.reserveWithPadding(newCapacity); mdatoms_->chargeA = chargeA_.data(); } +void MDAtoms::reserveChargeB(const int newCapacity) +{ + chargeB_.reserveWithPadding(newCapacity); + mdatoms_->chargeB = chargeB_.data(); +} + std::unique_ptr makeMDAtoms(FILE* fp, const gmx_mtop_t& mtop, const t_inputrec& ir, const bool rankHasPmeGpuTask) { auto mdAtoms = std::make_unique(); @@ -114,6 +127,7 @@ std::unique_ptr makeMDAtoms(FILE* fp, const gmx_mtop_t& mtop, const t_i if (rankHasPmeGpuTask) { changePinningPolicy(&mdAtoms->chargeA_, pme_get_pinning_policy()); + changePinningPolicy(&mdAtoms->chargeB_, pme_get_pinning_policy()); } t_mdatoms* md; snew(md, 1); @@ -242,12 +256,16 @@ void atoms2md(const gmx_mtop_t* mtop, // everything, but for now the semantics of md->nalloc being // the capacity are preserved by keeping vectors within // mdAtoms having the same properties as the other arrays. - mdAtoms->reserve(md->nalloc); - mdAtoms->resize(md->nr); + mdAtoms->reserveChargeA(md->nalloc); + mdAtoms->resizeChargeA(md->nr); + if (md->nPerturbed > 0) + { + mdAtoms->reserveChargeB(md->nalloc); + mdAtoms->resizeChargeB(md->nr); + } srenew(md->typeA, md->nalloc); if (md->nPerturbed) { - srenew(md->chargeB, md->nalloc); srenew(md->typeB, md->nalloc); } if (bLJPME) diff --git a/src/gromacs/mdlib/mdatoms.h b/src/gromacs/mdlib/mdatoms.h index c9895b9916..44dbcbba23 100644 --- a/src/gromacs/mdlib/mdatoms.h +++ b/src/gromacs/mdlib/mdatoms.h @@ -69,6 +69,8 @@ class MDAtoms unique_cptr mdatoms_; //! Memory for chargeA that can be set up for efficient GPU transfer. gmx::PaddedHostVector chargeA_; + //! Memory for chargeB that can be set up for efficient GPU transfer. + gmx::PaddedHostVector chargeB_; public: // TODO make this private @@ -78,16 +80,26 @@ public: t_mdatoms* mdatoms() { return mdatoms_.get(); } //! Const getter. 
const t_mdatoms* mdatoms() const { return mdatoms_.get(); } - /*! \brief Resizes memory. + /*! \brief Resizes memory for charges of FEP state A. * * \throws std::bad_alloc If out of memory. */ - void resize(int newSize); - /*! \brief Reserves memory. + void resizeChargeA(int newSize); + /*! \brief Resizes memory for charges of FEP state B. * * \throws std::bad_alloc If out of memory. */ - void reserve(int newCapacity); + void resizeChargeB(int newSize); + /*! \brief Reserves memory for charges of FEP state A. + * + * \throws std::bad_alloc If out of memory. + */ + void reserveChargeA(int newCapacity); + /*! \brief Reserves memory for charges of FEP state B. + * + * \throws std::bad_alloc If out of memory. + */ + void reserveChargeB(int newCapacity); //! Builder function. friend std::unique_ptr makeMDAtoms(FILE* fp, const gmx_mtop_t& mtop, const t_inputrec& ir, bool rankHasPmeGpuTask); diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp index 5159f2438d..611ca913dc 100644 --- a/src/gromacs/mdlib/sim_util.cpp +++ b/src/gromacs/mdlib/sim_util.cpp @@ -673,17 +673,19 @@ static void computeSpecialForces(FILE* fplog, * \param[in] pmedata The PME structure * \param[in] box The box matrix * \param[in] stepWork Step schedule flags - * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in - * the device memory. \param[in] wcycle The wallcycle structure + * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in the device memory. + * \param[in] lambdaQ The Coulomb lambda of the current state. + * \param[in] wcycle The wallcycle structure */ static inline void launchPmeGpuSpread(gmx_pme_t* pmedata, const matrix box, const StepWorkload& stepWork, GpuEventSynchronizer* xReadyOnDevice, + const real lambdaQ, gmx_wallcycle_t wcycle) { pme_gpu_prepare_computation(pmedata, box, wcycle, stepWork); - pme_gpu_launch_spread(pmedata, xReadyOnDevice, wcycle); + pme_gpu_launch_spread(pmedata, xReadyOnDevice, wcycle, lambdaQ); } /*! \brief Launch the FFT and gather stages of PME GPU @@ -691,13 +693,17 @@ static inline void launchPmeGpuSpread(gmx_pme_t* pmedata, * This function only implements setting the output forces (no accumulation). * * \param[in] pmedata The PME structure + * \param[in] lambdaQ The Coulomb lambda of the current system state. * \param[in] wcycle The wallcycle structure * \param[in] stepWork Step schedule flags */ -static void launchPmeGpuFftAndGather(gmx_pme_t* pmedata, gmx_wallcycle_t wcycle, const gmx::StepWorkload& stepWork) +static void launchPmeGpuFftAndGather(gmx_pme_t* pmedata, + const real lambdaQ, + gmx_wallcycle_t wcycle, + const gmx::StepWorkload& stepWork) { pme_gpu_launch_complex_transforms(pmedata, wcycle, stepWork); - pme_gpu_launch_gather(pmedata, wcycle); + pme_gpu_launch_gather(pmedata, wcycle, lambdaQ); } /*! \brief @@ -713,6 +719,7 @@ static void launchPmeGpuFftAndGather(gmx_pme_t* pmedata, gmx_wallcycle_t wcycle, * \param[in,out] pmedata PME module data * \param[in,out] forceOutputs Output buffer for the forces and virial * \param[in,out] enerd Energy data structure results are reduced into + * \param[in] lambdaQ The Coulomb lambda of the current system state. 
* \param[in] stepWork Step schedule flags * \param[in] wcycle The wallcycle structure */ @@ -720,6 +727,7 @@ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t* nbv, gmx_pme_t* pmedata, gmx::ForceOutputs* forceOutputs, gmx_enerdata_t* enerd, + const real lambdaQ, const StepWorkload& stepWork, gmx_wallcycle_t wcycle) { @@ -739,7 +747,7 @@ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t* nbv, GpuTaskCompletion completionType = (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check; isPmeGpuDone = pme_gpu_try_finish_task(pmedata, stepWork, wcycle, &forceWithVirial, - enerd, completionType); + enerd, lambdaQ, completionType); } if (!isNbGpuDone) @@ -1173,7 +1181,7 @@ void do_force(FILE* fplog, if (useGpuPmeOnThisRank) { - launchPmeGpuSpread(fr->pmedata, box, stepWork, localXReadyOnDevice, wcycle); + launchPmeGpuSpread(fr->pmedata, box, stepWork, localXReadyOnDevice, lambda[efptCOUL], wcycle); } /* do gridding for pair search */ @@ -1330,7 +1338,7 @@ void do_force(FILE* fplog, // X copy/transform to allow overlap as well as after the GPU NB // launch to avoid FFT launch overhead hijacking the CPU and delaying // the nonbonded kernel. - launchPmeGpuFftAndGather(fr->pmedata, wcycle, stepWork); + launchPmeGpuFftAndGather(fr->pmedata, lambda[efptCOUL], wcycle, stepWork); } /* Communicate coordinates and sum dipole if necessary + @@ -1685,12 +1693,14 @@ void do_force(FILE* fplog, && !DOMAINDECOMP(cr) && !stepWork.useGpuFBufferOps); if (alternateGpuWait) { - alternatePmeNbGpuWaitReduce(fr->nbv.get(), fr->pmedata, &forceOut, enerd, stepWork, wcycle); + alternatePmeNbGpuWaitReduce(fr->nbv.get(), fr->pmedata, &forceOut, enerd, lambda[efptCOUL], + stepWork, wcycle); } if (!alternateGpuWait && useGpuPmeOnThisRank) { - pme_gpu_wait_and_reduce(fr->pmedata, stepWork, wcycle, &forceOut.forceWithVirial(), enerd); + pme_gpu_wait_and_reduce(fr->pmedata, stepWork, wcycle, &forceOut.forceWithVirial(), enerd, + lambda[efptCOUL]); } /* Wait for local GPU NB outputs on the non-alternating wait path */ diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp index c9315bfc5c..b90bc9aea5 100644 --- a/src/gromacs/mdrun/runner.cpp +++ b/src/gromacs/mdrun/runner.cpp @@ -812,7 +812,7 @@ int Mdrunner::mdrunner() hw_opt.nthreads_tmpi); useGpuForPme = decideWhetherToUseGpusForPmeWithThreadMpi( useGpuForNonbonded, pmeTarget, gpuIdsToUse, userGpuTaskAssignment, *hwinfo, - *inputrec, mtop, hw_opt.nthreads_tmpi, domdecOptions.numPmeRanks); + *inputrec, hw_opt.nthreads_tmpi, domdecOptions.numPmeRanks); } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR @@ -878,7 +878,7 @@ int Mdrunner::mdrunner() nonbondedTarget, userGpuTaskAssignment, emulateGpuNonbonded, canUseGpuForNonbonded, gpuAccelerationOfNonbondedIsUseful(mdlog, *inputrec, !GMX_THREAD_MPI), gpusWereDetected); useGpuForPme = decideWhetherToUseGpusForPme( - useGpuForNonbonded, pmeTarget, userGpuTaskAssignment, *hwinfo, *inputrec, mtop, + useGpuForNonbonded, pmeTarget, userGpuTaskAssignment, *hwinfo, *inputrec, cr->sizeOfDefaultCommunicator, domdecOptions.numPmeRanks, gpusWereDetected); auto canUseGpuForBonded = buildSupportsGpuBondeds(nullptr) && inputSupportsGpuBondeds(*inputrec, mtop, nullptr); diff --git a/src/gromacs/mdrun/tpi.cpp b/src/gromacs/mdrun/tpi.cpp index 78305b2d10..3e780cade8 100644 --- a/src/gromacs/mdrun/tpi.cpp +++ b/src/gromacs/mdrun/tpi.cpp @@ -320,7 +320,7 @@ void LegacySimulator::do_tpi() if (EEL_PME(fr->ic->eeltype)) { - gmx_pme_reinit_atoms(fr->pmedata, a_tp0, nullptr); + gmx_pme_reinit_atoms(fr->pmedata, a_tp0, 
nullptr, nullptr); } /* With reacion-field we have distance dependent potentials diff --git a/src/gromacs/taskassignment/decidegpuusage.cpp b/src/gromacs/taskassignment/decidegpuusage.cpp index 2f9d2baa85..e8a9e697f6 100644 --- a/src/gromacs/taskassignment/decidegpuusage.cpp +++ b/src/gromacs/taskassignment/decidegpuusage.cpp @@ -154,14 +154,12 @@ bool decideWhetherToUseGpusForPmeWithThreadMpi(const bool useGpuFor const std::vector& userGpuTaskAssignment, const gmx_hw_info_t& hardwareInfo, const t_inputrec& inputrec, - const gmx_mtop_t& mtop, const int numRanksPerSimulation, const int numPmeRanksPerSimulation) { // First, exclude all cases where we can't run PME on GPUs. if ((pmeTarget == TaskTarget::Cpu) || !useGpuForNonbonded || !pme_gpu_supports_build(nullptr) - || !pme_gpu_supports_hardware(hardwareInfo, nullptr) - || !pme_gpu_supports_input(inputrec, mtop, nullptr)) + || !pme_gpu_supports_hardware(hardwareInfo, nullptr) || !pme_gpu_supports_input(inputrec, nullptr)) { // PME can't run on a GPU. If the user required that, we issue // an error later. @@ -328,7 +326,6 @@ bool decideWhetherToUseGpusForPme(const bool useGpuForNonbonded, const std::vector& userGpuTaskAssignment, const gmx_hw_info_t& hardwareInfo, const t_inputrec& inputrec, - const gmx_mtop_t& mtop, const int numRanksPerSimulation, const int numPmeRanksPerSimulation, const bool gpusWereDetected) @@ -365,7 +362,7 @@ bool decideWhetherToUseGpusForPme(const bool useGpuForNonbonded, } return false; } - if (!pme_gpu_supports_input(inputrec, mtop, &message)) + if (!pme_gpu_supports_input(inputrec, &message)) { if (pmeTarget == TaskTarget::Gpu) { diff --git a/src/gromacs/taskassignment/decidegpuusage.h b/src/gromacs/taskassignment/decidegpuusage.h index bfb002547a..5d15244416 100644 --- a/src/gromacs/taskassignment/decidegpuusage.h +++ b/src/gromacs/taskassignment/decidegpuusage.h @@ -136,7 +136,6 @@ bool decideWhetherToUseGpusForNonbondedWithThreadMpi(TaskTarget non * \param[in] userGpuTaskAssignment The user-specified assignment of GPU tasks to device IDs. * \param[in] hardwareInfo Hardware information * \param[in] inputrec The user input - * \param[in] mtop Global system topology * \param[in] numRanksPerSimulation The number of ranks in each simulation. * \param[in] numPmeRanksPerSimulation The number of PME ranks in each simulation. * @@ -150,7 +149,6 @@ bool decideWhetherToUseGpusForPmeWithThreadMpi(bool useGpuFor const std::vector& userGpuTaskAssignment, const gmx_hw_info_t& hardwareInfo, const t_inputrec& inputrec, - const gmx_mtop_t& mtop, int numRanksPerSimulation, int numPmeRanksPerSimulation); @@ -209,7 +207,6 @@ bool decideWhetherToUseGpusForNonbonded(TaskTarget nonbondedTarget, * \param[in] userGpuTaskAssignment The user-specified assignment of GPU tasks to device IDs. * \param[in] hardwareInfo Hardware information * \param[in] inputrec The user input - * \param[in] mtop Global system topology * \param[in] numRanksPerSimulation The number of ranks in each simulation. * \param[in] numPmeRanksPerSimulation The number of PME ranks in each simulation. * \param[in] gpusWereDetected Whether compatible GPUs were detected on any node. 
@@ -223,7 +220,6 @@ bool decideWhetherToUseGpusForPme(bool useGpuForNonbonded, const std::vector& userGpuTaskAssignment, const gmx_hw_info_t& hardwareInfo, const t_inputrec& inputrec, - const gmx_mtop_t& mtop, int numRanksPerSimulation, int numPmeRanksPerSimulation, bool gpusWereDetected); diff --git a/src/gromacs/taskassignment/resourcedivision.cpp b/src/gromacs/taskassignment/resourcedivision.cpp index 6d061c94e5..ac44344003 100644 --- a/src/gromacs/taskassignment/resourcedivision.cpp +++ b/src/gromacs/taskassignment/resourcedivision.cpp @@ -360,7 +360,7 @@ int get_nthreads_mpi(const gmx_hw_info_t* hwinfo, GMX_RELEASE_ASSERT((EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype)) && pme_gpu_supports_build(nullptr) && pme_gpu_supports_hardware(*hwinfo, nullptr) - && pme_gpu_supports_input(*inputrec, *mtop, nullptr), + && pme_gpu_supports_input(*inputrec, nullptr), "PME can't be on GPUs unless we are using PME"); // PME on GPUs supports a single PME rank with PP running on the same or few other ranks. -- 2.22.0
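
The patch threads a Coulomb lambda (lambda_q) through the spread and gather launches, stores a coefficient scaling value (scale) in the dynamic kernel parameters, and adds a coulombDvdl_ staging field to PmeOutput. The following is a minimal host-side sketch of how per-grid energies and virials could be combined, under the assumption of linear interpolation in lambda_q between the A and B grids when both are computed; the type and function names are illustrative only and are not the GROMACS API.

#include <array>

struct PerGridOutput /* reduced on the host from d_virialAndEnergy[gridIndex] */
{
    float                energy = 0.0F;
    std::array<float, 6> virial = {}; /* xx, yy, zz, xy, xz, yz */
};

struct CombinedCoulombOutput
{
    float                energy = 0.0F;
    float                dvdl   = 0.0F;
    std::array<float, 6> virial = {};
};

/* Combine FEP state A and B grid contributions, assuming a linear path in lambdaQ. */
static CombinedCoulombOutput combineFepGridOutputs(const PerGridOutput& stateA,
                                                   const PerGridOutput& stateB,
                                                   float                lambdaQ)
{
    CombinedCoulombOutput out;
    const float scaleA = 1.0F - lambdaQ; /* weight of the A grid */
    const float scaleB = lambdaQ;        /* weight of the B grid */
    out.energy         = scaleA * stateA.energy + scaleB * stateB.energy;
    /* For a linear interpolation, dV/dlambda is the difference of the end-state energies. */
    out.dvdl = stateB.energy - stateA.energy;
    for (size_t i = 0; i < out.virial.size(); i++)
    {
        out.virial[i] = scaleA * stateA.virial[i] + scaleB * stateB.virial[i];
    }
    return out;
}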