Fix AMD OpenCL float3 array optimization bug
author    Szilárd Páll <pall.szilard@gmail.com>
          Thu, 6 Feb 2020 23:40:12 +0000 (00:40 +0100)
committer Artem Zhmurov <zhmurov@gmail.com>
          Fri, 30 Oct 2020 05:45:43 +0000 (05:45 +0000)
Because the OpenCL spec defines float3 as 16 bytes in size, when float3
is used as an array element type the allocation needs to be optimized
by the compiler to avoid unnecessary register use. The nbnxm kernels
use a float3 i-force accumulator array that is intended to live in
registers.
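
For reference, a minimal sketch of the size arithmetic (array names
are illustrative only; the length 8 matches
c_nbnxnGpuNumClusterPerSupercluster, and the OpenCL C spec gives a
3-component vector the size and alignment of its 4-component
counterpart):

    float3 buf_vec[8];    /* 8 * sizeof(float3)    = 8 * 16 = 128 bytes */
    float  buf_arr[8][3]; /* 8 * 3 * sizeof(float) = 8 * 12 =  96 bytes */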

Starting with ROCm 2.3, the AMD OpenCL compiler regressed and lost its
ability to effectively optimize code that uses float3 register arrays.
The large number of extra registers used limits kernel occupancy and
significantly impacts performance. Only the AMD platform is affected;
other vendors' compilers are able to do the necessary transformations
to avoid the extra register use.

This change converts the float3 array to a float[3] array (via an fvec
typedef), saving 8*4 bytes of register space per work-item: 4 padding
bytes for each of the 8 i-cluster entries. This improves nonbonded
kernel performance on an AMD Vega GPU by 25% and 40% for the most
common flavors of the Ewald and RF force-only kernels, respectively.
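
In GCN register terms, a rough sketch of the per-work-item footprint,
assuming the accumulator is held entirely in 32-bit VGPRs (the intent
of the code, though not guaranteed by the compiler):

    float3 fci_buf[8]; /* 128 bytes -> 32 VGPRs */
    fvec   fci_buf[8]; /*  96 bytes -> 24 VGPRs */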

Note that eliminating the remaining non-array uses of float3 has no
significant impact.

src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel.clh
src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel_utils.clh

index 60242b0ba87fb3fb6e494d67d37eb6f62ba9f119..11516fd4f1caf4214b93f3b5a288d20332e5e1a2 100644 (file)
@@ -254,10 +254,12 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
 #endif
     barrier(CLK_LOCAL_MEM_FENCE);
 
-    float3 fci_buf[c_nbnxnGpuNumClusterPerSupercluster]; /* i force buffer */
+    fvec fci_buf[c_nbnxnGpuNumClusterPerSupercluster]; /* i force buffer */
     for (int ci_offset = 0; ci_offset < c_nbnxnGpuNumClusterPerSupercluster; ci_offset++)
     {
-        fci_buf[ci_offset] = (float3)(0.0F);
+        fci_buf[ci_offset][0] = 0.0F;
+        fci_buf[ci_offset][1] = 0.0F;
+        fci_buf[ci_offset][2] = 0.0F;
     }
 
 #ifdef LJ_EWALD
@@ -554,7 +556,9 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
                                 fcj_buf -= f_ij;
 
                                 /* accumulate i forces in registers */
-                                fci_buf[i] += f_ij;
+                                fci_buf[i][0] += f_ij.x;
+                                fci_buf[i][1] += f_ij.y;
+                                fci_buf[i][2] += f_ij.z;
                             }
                         }
 
index e8702e7c74ba2c3ea55469232c739a989efe3ee6..1b7ed8478abe06464a0c4da06403cde0646d46fc 100644 (file)
 #        define gmx_unused
 #    endif
 
+typedef float fvec[3];
+
 // Data structures shared between OpenCL device code and OpenCL host code
 // TODO: review, improve
 // Replaced real by float for now, to avoid including any other header
@@ -669,7 +671,7 @@ gmx_opencl_inline void reduce_force_j(__local float gmx_unused* f_buf,
 }
 
 #    if REDUCE_SHUFFLE
-gmx_opencl_inline void reduce_force_i_and_shift_shfl(float3*         fci_buf,
+gmx_opencl_inline void reduce_force_i_and_shift_shfl(__private fvec  fci_buf[],
                                                      __global float* fout,
                                                      bool            bCalcFshift,
                                                      int             tidxi,
@@ -684,7 +686,7 @@ gmx_opencl_inline void reduce_force_i_and_shift_shfl(float3*         fci_buf,
     for (int ci_offset = 0; ci_offset < c_nbnxnGpuNumClusterPerSupercluster; ci_offset++)
     {
         int    aidx = (sci * c_nbnxnGpuNumClusterPerSupercluster + ci_offset) * CL_SIZE + tidxi;
-        float3 fin  = fci_buf[ci_offset];
+        float3 fin  = (float3) (fci_buf[ci_offset][0], fci_buf[ci_offset][1], fci_buf[ci_offset][2]);
         fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, CL_SIZE);
         fin.y += intel_sub_group_shuffle_up(fin.y, fin.y, CL_SIZE);
         fin.z += intel_sub_group_shuffle_down(fin.z, fin.z, CL_SIZE);
@@ -727,7 +729,7 @@ gmx_opencl_inline void reduce_force_i_and_shift_shfl(float3*         fci_buf,
  *  array sizes.
  */
 gmx_opencl_inline void reduce_force_i_and_shift_pow2(volatile __local float* f_buf,
-                                                     float3*                 fci_buf,
+                                                     __private fvec          fci_buf[],
                                                      __global float*         fout,
                                                      bool                    bCalcFshift,
                                                      int                     tidxi,
@@ -742,9 +744,9 @@ gmx_opencl_inline void reduce_force_i_and_shift_pow2(volatile __local float* f_b
         int aidx = (sci * c_nbnxnGpuNumClusterPerSupercluster + ci_offset) * CL_SIZE + tidxi;
         int tidx = tidxi + tidxj * CL_SIZE;
         /* store i forces in shmem */
-        f_buf[tidx]                   = fci_buf[ci_offset].x;
-        f_buf[FBUF_STRIDE + tidx]     = fci_buf[ci_offset].y;
-        f_buf[2 * FBUF_STRIDE + tidx] = fci_buf[ci_offset].z;
+        f_buf[tidx]                   = fci_buf[ci_offset][0];
+        f_buf[FBUF_STRIDE + tidx]     = fci_buf[ci_offset][1];
+        f_buf[2 * FBUF_STRIDE + tidx] = fci_buf[ci_offset][2];
         barrier(CLK_LOCAL_MEM_FENCE);
 
         /* Reduce the initial CL_SIZE values for each i atom to half
@@ -806,7 +808,7 @@ gmx_opencl_inline void reduce_force_i_and_shift_pow2(volatile __local float* f_b
 /*! Final i-force reduction
  */
 gmx_opencl_inline void reduce_force_i_and_shift(__local float gmx_unused* f_buf,
-                                                float3*                   fci_buf,
+                                                __private fvec            fci_buf[],
                                                 __global float*           f,
                                                 bool                      bCalcFshift,
                                                 int                       tidxi,