Fix AMD OpenCL float3 array optimization bug
author    Szilárd Páll <pall.szilard@gmail.com>
          Thu, 6 Feb 2020 23:40:12 +0000 (00:40 +0100)
committer Artem Zhmurov <zhmurov@gmail.com>
          Fri, 30 Oct 2020 05:45:43 +0000 (05:45 +0000)
Because the OpenCL spec defines float3 as 16 bytes in size, when float3
is used as an array element type the allocation needs to be optimized
by the compiler to avoid unnecessary register use. The nbnxm kernels
use a float3 i-force accumulator array that is intended to live in
registers.
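
For reference, a minimal sketch of the size arithmetic (array names
are illustrative only; the length 8 matches
c_nbnxnGpuNumClusterPerSupercluster, and the OpenCL C spec gives a
3-component vector the size and alignment of its 4-component
counterpart):

    float3 buf_vec[8];    /* 8 * sizeof(float3)    = 8 * 16 = 128 bytes */
    float  buf_arr[8][3]; /* 8 * 3 * sizeof(float) = 8 * 12 =  96 bytes */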

Starting with ROCm 2.3, the AMD OpenCL compiler regressed and lost its
ability to effectively optimize code that uses float3 register arrays.
The large number of extra registers used limits kernel occupancy and
significantly impacts performance. Only the AMD platform is affected;
other vendors' compilers are able to do the necessary transformations
to avoid the extra register use.

This change converts the float3 array to a float[3] array (via an fvec
typedef), saving 8*4 bytes of register space per work-item: 4 padding
bytes for each of the 8 i-cluster entries. This improves nonbonded
kernel performance on an AMD Vega GPU by 25% and 40% for the most
common flavors of the Ewald and RF force-only kernels, respectively.
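
In GCN register terms, a rough sketch of the per-work-item footprint,
assuming the accumulator is held entirely in 32-bit VGPRs (the intent
of the code, though not guaranteed by the compiler):

    float3 fci_buf[8]; /* 128 bytes -> 32 VGPRs */
    fvec   fci_buf[8]; /*  96 bytes -> 24 VGPRs */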

Note that eliminating the remaining non-array uses of float3 has no
significant impact.

src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel.clh
src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel_utils.clh

index 60242b0ba87fb3fb6e494d67d37eb6f62ba9f119..11516fd4f1caf4214b93f3b5a288d20332e5e1a2 100644 (file)
@@ -254,10 +254,12 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
 #endif
     barrier(CLK_LOCAL_MEM_FENCE);
 
-    float3 fci_buf[c_nbnxnGpuNumClusterPerSupercluster]; /* i force buffer */
+    fvec fci_buf[c_nbnxnGpuNumClusterPerSupercluster]; /* i force buffer */
     for (int ci_offset = 0; ci_offset < c_nbnxnGpuNumClusterPerSupercluster; ci_offset++)
     {
-        fci_buf[ci_offset] = (float3)(0.0F);
+        fci_buf[ci_offset][0] = 0.0F;
+        fci_buf[ci_offset][1] = 0.0F;
+        fci_buf[ci_offset][2] = 0.0F;
     }
 
 #ifdef LJ_EWALD
@@ -554,7 +556,9 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
                                 fcj_buf -= f_ij;
 
                                 /* accumulate i forces in registers */
-                                fci_buf[i] += f_ij;
+                                fci_buf[i][0] += f_ij.x;
+                                fci_buf[i][1] += f_ij.y;
+                                fci_buf[i][2] += f_ij.z;
                             }
                         }
 
index e8702e7c74ba2c3ea55469232c739a989efe3ee6..1b7ed8478abe06464a0c4da06403cde0646d46fc 100644 (file)
 #        define gmx_unused
 #    endif
 
+typedef float fvec[3];
+
 // Data structures shared between OpenCL device code and OpenCL host code
 // TODO: review, improve
 // Replaced real by float for now, to avoid including any other header
@@ -669,7 +671,7 @@ gmx_opencl_inline void reduce_force_j(__local float gmx_unused* f_buf,
 }
 
 #    if REDUCE_SHUFFLE
-gmx_opencl_inline void reduce_force_i_and_shift_shfl(float3*         fci_buf,
+gmx_opencl_inline void reduce_force_i_and_shift_shfl(__private fvec  fci_buf[],
                                                      __global float* fout,
                                                      bool            bCalcFshift,
                                                      int             tidxi,
@@ -684,7 +686,7 @@ gmx_opencl_inline void reduce_force_i_and_shift_shfl(float3*         fci_buf,
     for (int ci_offset = 0; ci_offset < c_nbnxnGpuNumClusterPerSupercluster; ci_offset++)
     {
         int    aidx = (sci * c_nbnxnGpuNumClusterPerSupercluster + ci_offset) * CL_SIZE + tidxi;
-        float3 fin  = fci_buf[ci_offset];
+        float3 fin  = (float3) (fci_buf[ci_offset][0], fci_buf[ci_offset][1], fci_buf[ci_offset][2]);
         fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, CL_SIZE);
         fin.y += intel_sub_group_shuffle_up(fin.y, fin.y, CL_SIZE);
         fin.z += intel_sub_group_shuffle_down(fin.z, fin.z, CL_SIZE);
@@ -727,7 +729,7 @@ gmx_opencl_inline void reduce_force_i_and_shift_shfl(float3*         fci_buf,
  *  array sizes.
  */
 gmx_opencl_inline void reduce_force_i_and_shift_pow2(volatile __local float* f_buf,
-                                                     float3*                 fci_buf,
+                                                     __private fvec          fci_buf[],
                                                      __global float*         fout,
                                                      bool                    bCalcFshift,
                                                      int                     tidxi,
@@ -742,9 +744,9 @@ gmx_opencl_inline void reduce_force_i_and_shift_pow2(volatile __local float* f_b
         int aidx = (sci * c_nbnxnGpuNumClusterPerSupercluster + ci_offset) * CL_SIZE + tidxi;
         int tidx = tidxi + tidxj * CL_SIZE;
         /* store i forces in shmem */
-        f_buf[tidx]                   = fci_buf[ci_offset].x;
-        f_buf[FBUF_STRIDE + tidx]     = fci_buf[ci_offset].y;
-        f_buf[2 * FBUF_STRIDE + tidx] = fci_buf[ci_offset].z;
+        f_buf[tidx]                   = fci_buf[ci_offset][0];
+        f_buf[FBUF_STRIDE + tidx]     = fci_buf[ci_offset][1];
+        f_buf[2 * FBUF_STRIDE + tidx] = fci_buf[ci_offset][2];
         barrier(CLK_LOCAL_MEM_FENCE);
 
         /* Reduce the initial CL_SIZE values for each i atom to half
@@ -806,7 +808,7 @@ gmx_opencl_inline void reduce_force_i_and_shift_pow2(volatile __local float* f_b
 /*! Final i-force reduction
  */
 gmx_opencl_inline void reduce_force_i_and_shift(__local float gmx_unused* f_buf,
-                                                float3*                   fci_buf,
+                                                __private fvec            fci_buf[],
                                                 __global float*           f,
                                                 bool                      bCalcFshift,
                                                 int                       tidxi,