Merge remote-tracking branch 'origin/release-2021' into master

[alexxy/gromacs.git] / src / gromacs / ewald / pme_gpu_calculate_splines.cuh
diff --git a/src/gromacs/ewald/pme_gpu_calculate_splines.cuh b/src/gromacs/ewald/pme_gpu_calculate_splines.cuh

index 6e663175e0eefd05b04bed49d7616b6fecbd97f8..1ff60f7ed17497fceda74771c9269667f3775df6 100644 (file)
--- a/src/gromacs/ewald/pme_gpu_calculate_splines.cuh
+++ b/src/gromacs/ewald/pme_gpu_calculate_splines.cuh
@@ -173,8 +173,12 @@ __device__ __forceinline__ void pme_gpu_stage_atom_data(T* __restrict__ sm_desti
   * \tparam[in] atomsPerBlock        Number of atoms processed by a block - should be accounted for
   *                                  in the sizes of the shared memory arrays.
   * \tparam[in] atomsPerWarp         Number of atoms processed by a warp
- * \tparam[in] writeSmDtheta        Bool controlling if the theta derivative should be written to shared memory. Enables calculation of dtheta if set.
- * \tparam[in] writeGlobal          A boolean which tells if the theta values and gridlines should be written to global memory. Enables calculation of dtheta if set.
+ * \tparam[in] writeSmDtheta        Bool controlling if the theta derivative should be written to
+ *                                  shared memory. Enables calculation of dtheta if set.
+ * \tparam[in] writeGlobal          A boolean which tells if the theta values and gridlines should
+ *                                  be written to global memory. Enables calculation of dtheta if
+ *                                  set.
+ * \tparam[in] numGrids             The number of grids using the splines.
   * \param[in]  kernelParams         Input PME CUDA data in constant memory.
   * \param[in]  atomIndexOffset      Starting atom index for the execution block w.r.t. global memory.
   * \param[in]  atomX                Atom coordinate of atom processed by thread.
@@ -184,7 +188,7 @@ __device__ __forceinline__ void pme_gpu_stage_atom_data(T* __restrict__ sm_desti
   * \param[out] sm_gridlineIndices   Atom gridline indices in the shared memory.
   */
  
-template<int order, int atomsPerBlock, int atomsPerWarp, bool writeSmDtheta, bool writeGlobal>
+template<int order, int atomsPerBlock, int atomsPerWarp, bool writeSmDtheta, bool writeGlobal, int numGrids>
  __device__ __forceinline__ void calculate_splines(const PmeGpuCudaKernelParams kernelParams,
                                                    const int                    atomIndexOffset,
                                                    const float3                 atomX,
@@ -193,6 +197,9 @@ __device__ __forceinline__ void calculate_splines(const PmeGpuCudaKernelParams k
                                                    float* __restrict__ sm_dtheta,
                                                    int* __restrict__ sm_gridlineIndices)
  {
+    assert(numGrids == 1 || numGrids == 2);
+    assert(numGrids == 1 || c_skipNeutralAtoms == false);
+
      /* Global memory pointers for output */
      float* __restrict__ gm_theta         = kernelParams.atoms.d_theta;
      float* __restrict__ gm_dtheta        = kernelParams.atoms.d_dtheta;
@@ -293,7 +300,8 @@ __device__ __forceinline__ void calculate_splines(const PmeGpuCudaKernelParams k
          /* B-spline calculation */
  
          const int chargeCheck = pme_gpu_check_atom_charge(atomCharge);
-        if (chargeCheck)
+        /* With FEP (numGrids == 2), we might have 0 charge in state A, but !=0 in state B, so we always calculate splines */
+        if (numGrids == 2 || chargeCheck)
          {
              float div;
              int o = orderIndex; // This is an index that is set once for PME_GPU_PARALLEL_SPLINE == 1