SYCL: Reduce the number of atomic ops in NBNXM fShift calculation
author Andrey Alekseenko <al42and@gmail.com>
Thu, 17 Jun 2021 13:34:35 +0000 (13:34 +0000)
committer Mark Abraham <mark.j.abraham@gmail.com>
Thu, 17 Jun 2021 13:34:35 +0000 (13:34 +0000)
src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp

index 1a37f49dfff3969b17e9d9046152fd11518066e0..c5de5de121b4696917f3ee7304f5ec1febded4f4 100644 (file)
@@ -479,7 +479,30 @@ static inline void reduceForceIAndFShift(cl::sycl::accessor<float, 1, mode::read
            storing the reduction result above. */
         if (tidxj < 3)
         {
-            atomicFetchAdd(a_fShift, 3 * shift + tidxj, fShiftBuf);
+            if constexpr (c_clSize == 4)
+            {
+                /* Intel Xe (Gen12LP) and earlier GPUs implement floating-point atomics via
+                 * a compare-and-swap (CAS) loop, which performs particularly poorly when
+                 * several work-items of the same work-group update the same memory location.
+                 * We therefore pre-reduce fShiftBuf within the sub-group, so that only one
+                 * work-item per shift-vector component issues the atomic add. The same
+                 * optimization might be slightly beneficial for NVIDIA and AMD as well, but
+                 * it is unlikely to make a big difference there and thus was not evaluated.
+                 */
+                auto sg = itemIdx.get_sub_group();
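+                // After the two shuffles below, the work-item with tidxi == 0 holds the
+                // sum of fShiftBuf over all four tidxi lanes and does the atomic add.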
+                fShiftBuf += sycl_2020::shift_left(sg, fShiftBuf, 1);
+                fShiftBuf += sycl_2020::shift_left(sg, fShiftBuf, 2);
+                if (tidxi == 0)
+                {
+                    atomicFetchAdd(a_fShift, 3 * shift + tidxj, fShiftBuf);
+                }
+            }
+            else
+            {
+                atomicFetchAdd(a_fShift, 3 * shift + tidxj, fShiftBuf);
+            }
         }
     }
 }
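
For reference, below is a minimal standalone sketch of the shuffle-based pre-reduction applied in this commit: every work-item holds a partial value, two shift-left shuffles fold each aligned group of four sub-group lanes, and only the first lane of each group issues the floating-point atomic, cutting the number of atomic operations by a factor of four. This is not GROMACS code: the standard sycl::shift_group_left stands in for the sycl_2020::shift_left compatibility wrapper used in the kernel, and the queue setup, the clSize constant, and the constant payload are illustrative assumptions. It assumes a SYCL 2020 compiler and a device sub-group size that is a multiple of four.

#include <sycl/sycl.hpp>

#include <cstdio>

int main()
{
    sycl::queue q;
    constexpr unsigned clSize = 4; // number of lanes folded into a single atomic
    float* result = sycl::malloc_shared<float>(1, q);
    *result = 0.0F;

    q.parallel_for(sycl::nd_range<1>{ 64, 16 }, [=](sycl::nd_item<1> itemIdx) {
         float buf = 1.0F; // per-work-item partial contribution
         sycl::sub_group sg = itemIdx.get_sub_group();
         // Tree reduction: after these two shuffles, lane i holds the sum of
         // lanes i..i+3, so each aligned chunk's total lands in its first lane.
         buf += sycl::shift_group_left(sg, buf, 1);
         buf += sycl::shift_group_left(sg, buf, 2);
         if (sg.get_local_linear_id() % clSize == 0) // one atomic per four lanes
         {
             sycl::atomic_ref<float,
                              sycl::memory_order::relaxed,
                              sycl::memory_scope::device,
                              sycl::access::address_space::global_space>
                     atomicResult(*result);
             atomicResult.fetch_add(buf);
         }
     }).wait();

    std::printf("sum = %g (expected 64)\n", *result); // 64 work-items contribute 1.0 each
    sycl::free(result, q);
}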