Add prepareGpuKernelArguments() and launchGpuKernel()
diff --git a/src/gromacs/ewald/pme-gather.cu b/src/gromacs/ewald/pme-gather.cu
index 7dc096fc2415c1d7f4a212dbdcb7bab03f589640..71d1687926a63104abab09ed1d4c3b943f2b1e99 100644
--- a/src/gromacs/ewald/pme-gather.cu
+++ b/src/gromacs/ewald/pme-gather.cu
@@ -428,7 +428,6 @@ void pme_gpu_gather(PmeGpu                *pmeGpu,
         pme_gpu_copy_input_forces(pmeGpu);
     }
 
-    cudaStream_t stream          = pmeGpu->archSpecific->pmeStream;
     const int    order           = pmeGpu->common->pme_order;
     const auto  *kernelParamsPtr = pmeGpu->kernelParams.get();
 
@@ -445,35 +444,38 @@ void pme_gpu_gather(PmeGpu                *pmeGpu,
     const int atomsPerBlock  =  (c_gatherMaxThreadsPerBlock / PME_SPREADGATHER_THREADS_PER_ATOM);
     GMX_ASSERT(!c_usePadding || !(PME_ATOM_DATA_ALIGNMENT % atomsPerBlock), "inconsistent atom data padding vs. gathering block size");
 
-    const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock;
-    auto      dimGrid    = pmeGpuCreateGrid(pmeGpu, blockCount);
-    dim3 dimBlock(order, order, atomsPerBlock);
+    const int          blockCount = pmeGpu->nAtomsPadded / atomsPerBlock;
+    auto               dimGrid    = pmeGpuCreateGrid(pmeGpu, blockCount);
 
-    const bool wrapX = true;
-    const bool wrapY = true;
+    KernelLaunchConfig config;
+    config.blockSize[0] = config.blockSize[1] = order;
+    config.blockSize[2] = atomsPerBlock;
+    config.gridSize[0]  = dimGrid.x;
+    config.gridSize[1]  = dimGrid.y;
+    config.stream       = pmeGpu->archSpecific->pmeStream;
+
+    if (order != 4)
+    {
+        GMX_THROW(gmx::NotImplementedError("The code for pme_order != 4 was not implemented!"));
+    }
+
+    constexpr bool wrapX = true;
+    constexpr bool wrapY = true;
     GMX_UNUSED_VALUE(wrapX);
     GMX_UNUSED_VALUE(wrapY);
 
     // TODO test different cache configs
 
-    pme_gpu_start_timing(pmeGpu, gtPME_GATHER);
-    if (order == 4)
-    {
-        if (forceTreatment == PmeForceOutputHandling::Set)
-        {
-            pme_gather_kernel<4, true, wrapX, wrapY> <<< dimGrid, dimBlock, 0, stream>>> (*kernelParamsPtr);
-        }
-        else
-        {
-            pme_gather_kernel<4, false, wrapX, wrapY> <<< dimGrid, dimBlock, 0, stream>>> (*kernelParamsPtr);
-        }
-    }
-    else
-    {
-        GMX_THROW(gmx::NotImplementedError("The code for pme_order != 4 is not implemented"));
-    }
-    CU_LAUNCH_ERR("pme_gather_kernel");
-    pme_gpu_stop_timing(pmeGpu, gtPME_GATHER);
+    int  timingId = gtPME_GATHER;
+    void (*kernelPtr)(const PmeGpuCudaKernelParams) = (forceTreatment == PmeForceOutputHandling::Set) ?
+        pme_gather_kernel<4, true, wrapX, wrapY> :
+        pme_gather_kernel<4, false, wrapX, wrapY>;
+
+    pme_gpu_start_timing(pmeGpu, timingId);
+    auto      *timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timingId);
+    const auto kernelArgs  = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
+    launchGpuKernel(kernelPtr, config, timingEvent, "PME gather", kernelArgs);
+    pme_gpu_stop_timing(pmeGpu, timingId);
 
     pme_gpu_copy_output_forces(pmeGpu);
 }
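
The helpers themselves are not part of this hunk, so the code below is only a minimal sketch of the launch pattern the change introduces, not the GROMACS implementation: KernelLaunchConfig, prepareGpuKernelArguments() and launchGpuKernel() here are simplified stand-ins (the timing-event argument seen above is omitted), and scale_kernel, KernelParams and runScale are hypothetical names invented for the example. It compiles as a single translation unit with nvcc.

// Minimal sketch of the launch pattern introduced above; simplified stand-ins,
// not the real GROMACS helpers.
#include <array>
#include <cstdio>
#include <cuda_runtime.h>

// Gathers everything a launch needs, replacing bare <<<grid, block, 0, stream>>> syntax.
struct KernelLaunchConfig
{
    unsigned int gridSize[3]      = { 1, 1, 1 };
    unsigned int blockSize[3]     = { 1, 1, 1 };
    size_t       sharedMemorySize = 0;
    cudaStream_t stream           = nullptr;
};

// Stand-in for a kernel parameter struct passed by value, like PmeGpuCudaKernelParams.
struct KernelParams
{
    const float *input;
    float       *output;
    int          n;
};

__global__ void scale_kernel(const KernelParams params)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < params.n)
    {
        params.output[i] = 2.0f * params.input[i];
    }
}

// Packs pointers to the kernel arguments in the form cudaLaunchKernel() expects.
template<typename ... Args>
std::array<void *, sizeof ... (Args)> prepareGpuKernelArguments(void (* /*kernel*/)(Args ...),
                                                                const KernelLaunchConfig & /*config*/,
                                                                const Args *... args)
{
    return {{ const_cast<void *>(static_cast<const void *>(args)) ... }};
}

// Thin wrapper over cudaLaunchKernel() that consumes the launch configuration
// and reports launch errors; timing-event handling is omitted in this sketch.
template<typename ... Args, size_t N>
void launchGpuKernel(void (*kernel)(Args ...),
                     const KernelLaunchConfig    &config,
                     const char                  *kernelName,
                     const std::array<void *, N> &kernelArgs)
{
    const dim3  grid(config.gridSize[0], config.gridSize[1], config.gridSize[2]);
    const dim3  block(config.blockSize[0], config.blockSize[1], config.blockSize[2]);
    cudaError_t stat = cudaLaunchKernel(reinterpret_cast<void *>(kernel), grid, block,
                                        const_cast<void **>(kernelArgs.data()),
                                        config.sharedMemorySize, config.stream);
    if (stat != cudaSuccess)
    {
        fprintf(stderr, "%s launch failed: %s\n", kernelName, cudaGetErrorString(stat));
    }
}

// Host-side caller structured like the new pme_gpu_gather() launch path.
void runScale(const float *d_input, float *d_output, int n, cudaStream_t stream)
{
    KernelLaunchConfig config;
    config.blockSize[0] = 128;
    config.gridSize[0]  = (n + 127) / 128;
    config.stream       = stream;

    const KernelParams  params    = { d_input, d_output, n };
    const KernelParams *paramsPtr = &params;

    auto      *kernelPtr  = scale_kernel;
    const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, paramsPtr);
    launchGpuKernel(kernelPtr, config, "scale", kernelArgs);
}

Funnelling every launch through one wrapper keeps the grid/block bookkeeping, the argument packing and the launch-error check in a single place, which is what the hunk above does for the gather kernel instead of spelling out <<<dimGrid, dimBlock, 0, stream>>> and CU_LAUNCH_ERR at the call site.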