From a2084d8287a631ba554fd4fd674f3502d9746c90 Mon Sep 17 00:00:00 2001
From: Magnus Lundborg <magnus.lundborg@scilifelab.se>
Date: Mon, 13 Sep 2021 18:08:15 +0000
Subject: [PATCH] Calculate FEP (GPU) splines for all atoms.

Since charges can be 0 in one state, but not in the other, the splines
still need to be calculated. This was already ensured by c_skipNeutralAtoms
being false, but this commit makes that more clear.

Refs: #4139
---
 src/gromacs/ewald/pme_gather.cu                |  4 ++--
 .../ewald/pme_gpu_calculate_splines.cuh        | 18 +++++++++++++-----
 src/gromacs/ewald/pme_spread.clh               |  7 +++++--
 src/gromacs/ewald/pme_spread.cu                |  4 ++--
 4 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/src/gromacs/ewald/pme_gather.cu b/src/gromacs/ewald/pme_gather.cu
index 160f544d3e..40eea2d2dc 100644
--- a/src/gromacs/ewald/pme_gather.cu
+++ b/src/gromacs/ewald/pme_gather.cu
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -459,7 +459,7 @@ __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__
             atomX      = gm_coordinates[atomIndexGlobal];
             atomCharge = gm_coefficientsA[atomIndexGlobal];
         }
-        calculate_splines<order, atomsPerBlock, atomsPerWarp, true, false>(
+        calculate_splines<order, atomsPerBlock, atomsPerWarp, true, false, numGrids>(
                 kernelParams, atomIndexOffset, atomX, atomCharge, sm_theta, sm_dtheta, sm_gridlineIndices);
         __syncwarp();
     }
diff --git a/src/gromacs/ewald/pme_gpu_calculate_splines.cuh b/src/gromacs/ewald/pme_gpu_calculate_splines.cuh
index fc319eb1c2..1e3ccefb01 100644
--- a/src/gromacs/ewald/pme_gpu_calculate_splines.cuh
+++ b/src/gromacs/ewald/pme_gpu_calculate_splines.cuh
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -173,8 +173,12 @@ __device__ __forceinline__ void pme_gpu_stage_atom_data(T* __restrict__ sm_desti
  * \tparam[in] atomsPerBlock        Number of atoms processed by a block - should be accounted for
  *                                  in the sizes of the shared memory arrays.
  * \tparam[in] atomsPerWarp         Number of atoms processed by a warp
- * \tparam[in] writeSmDtheta        Bool controling if the theta derivative should be written to shared memory. Enables calculation of dtheta if set.
- * \tparam[in] writeGlobal          A boolean which tells if the theta values and gridlines should be written to global memory. Enables calculation of dtheta if set.
+ * \tparam[in] writeSmDtheta        Bool controling if the theta derivative should be written to
+ *                                  shared memory. Enables calculation of dtheta if set.
+ * \tparam[in] writeGlobal          A boolean which tells if the theta values and gridlines should
+ *                                  be written to global memory. Enables calculation of dtheta if
+ *                                  set.
+ * \tparam[in] numGrids             The number of grids using the splines.
  * \param[in]  kernelParams         Input PME CUDA data in constant memory.
  * \param[in]  atomIndexOffset      Starting atom index for the execution block w.r.t. global memory.
  * \param[in]  atomX                Atom coordinate of atom processed by thread.
@@ -184,7 +188,7 @@ __device__ __forceinline__ void pme_gpu_stage_atom_data(T* __restrict__ sm_desti
  * \param[out] sm_gridlineIndices   Atom gridline indices in the shared memory.
  */
 
-template<int order, int atomsPerBlock, int atomsPerWarp, bool writeSmDtheta, bool writeGlobal>
+template<int order, int atomsPerBlock, int atomsPerWarp, bool writeSmDtheta, bool writeGlobal, int numGrids>
 __device__ __forceinline__ void calculate_splines(const PmeGpuCudaKernelParams kernelParams,
                                                   const int                    atomIndexOffset,
                                                   const float3                 atomX,
@@ -193,6 +197,9 @@ __device__ __forceinline__ void calculate_splines(const PmeGpuCudaKernelParams k
                                                   float* __restrict__ sm_dtheta,
                                                   int* __restrict__ sm_gridlineIndices)
 {
+    assert(numGrids == 1 || numGrids == 2);
+    assert(numGrids == 1 || c_skipNeutralAtoms == false);
+
     /* Global memory pointers for output */
     float* __restrict__ gm_theta         = kernelParams.atoms.d_theta;
     float* __restrict__ gm_dtheta        = kernelParams.atoms.d_dtheta;
@@ -293,7 +300,8 @@ __device__ __forceinline__ void calculate_splines(const PmeGpuCudaKernelParams k
         /* B-spline calculation */
 
         const int chargeCheck = pme_gpu_check_atom_charge(atomCharge);
-        if (chargeCheck)
+        /* With FEP (numGrids == 2), we might have 0 charge in state A, but !=0 in state B, so we always calculate splines */
+        if (numGrids == 2 || chargeCheck)
         {
             float div;
             int o = orderIndex; // This is an index that is set once for PME_GPU_PARALLEL_SPLINE == 1
diff --git a/src/gromacs/ewald/pme_spread.clh b/src/gromacs/ewald/pme_spread.clh
index 9e887a9e6e..7f6f4beccd 100644
--- a/src/gromacs/ewald/pme_spread.clh
+++ b/src/gromacs/ewald/pme_spread.clh
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -132,6 +132,8 @@ gmx_opencl_inline void calculate_splines(const struct PmeOpenCLKernelParams kern
     /* Thread index w.r.t. block */
     assert((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0) + get_local_id(0)
            < MAX_INT);
+    assert(numGrids == 1 || numGrids == 2);
+    assert(numGrids == 1 || c_skipNeutralAtoms == false);
     const int threadLocalIndex =
             (int)((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0)
                   + get_local_id(0));
@@ -238,7 +240,8 @@ gmx_opencl_inline void calculate_splines(const struct PmeOpenCLKernelParams kern
         /* B-spline calculation */
 
         const int chargeCheck = pme_gpu_check_atom_charge(sm_coefficients[atomIndexLocal]);
-        if (chargeCheck)
+        /* With FEP (numGrids == 2), we might have 0 charge in state A, but !=0 in state B, so we always calculate splines */
+        if (numGrids == 2 || chargeCheck)
         {
             int o = orderIndex; // This is an index that is set once for PME_GPU_PARALLEL_SPLINE == 1
 
diff --git a/src/gromacs/ewald/pme_spread.cu b/src/gromacs/ewald/pme_spread.cu
index 3cf3a1f7ca..38f41ec5b2 100644
--- a/src/gromacs/ewald/pme_spread.cu
+++ b/src/gromacs/ewald/pme_spread.cu
@@ -3,7 +3,7 @@
  *
  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2013-2016,2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2013-2016,2017,2018,2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -252,7 +252,7 @@ __launch_bounds__(c_spreadMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBU
         {
             atomX = gm_coordinates[atomIndexGlobal];
         }
-        calculate_splines<order, atomsPerBlock, atomsPerWarp, false, writeGlobal>(
+        calculate_splines<order, atomsPerBlock, atomsPerWarp, false, writeGlobal, numGrids>(
                 kernelParams, atomIndexOffset, atomX, atomCharge, sm_theta, &dtheta, sm_gridlineIndices);
         __syncwarp();
     }
-- 
2.22.0