+ launchGpuKernel(kernelPtr, config, *launchStream, timingEvent, "PME spline/spread", kernelArgs);
+ }
+ // Set dependencies for PME stream on all pipeline streams
+ for (int i = 0; i < pmeCoordinateReceiverGpu->ppCommNumSenderRanks(); i++)
+ {
+ GpuEventSynchronizer event;
+ event.markEvent(*(pmeCoordinateReceiverGpu->ppCommStream(i)));
+ event.enqueueWaitEvent(pmeGpu->archSpecific->pmeStream_);
+ }
+ }
+ else // pipelining is not in use
+ {
+ if (useGpuDirectComm) // Sync all PME-PP communications to PME stream
+ {
+ pmeCoordinateReceiverGpu->synchronizeOnCoordinatesFromAllPpRanks(pmeGpu->archSpecific->pmeStream_);
+ }
+
+#if c_canEmbedBuffers
+ const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
+#else
+ const auto kernelArgs =
+ prepareGpuKernelArguments(kernelPtr,
+ config,
+ kernelParamsPtr,
+ &kernelParamsPtr->atoms.d_theta,
+ &kernelParamsPtr->atoms.d_dtheta,
+ &kernelParamsPtr->atoms.d_gridlineIndices,
+ &kernelParamsPtr->grid.d_realGrid[FEP_STATE_A],
+ &kernelParamsPtr->grid.d_realGrid[FEP_STATE_B],
+ &kernelParamsPtr->grid.d_fractShiftsTable,
+ &kernelParamsPtr->grid.d_gridlineIndicesTable,
+ &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_A],
+ &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_B],
+ &kernelParamsPtr->atoms.d_coordinates);
+#endif
+
+ launchGpuKernel(kernelPtr,
+ config,
+ pmeGpu->archSpecific->pmeStream_,
+ timingEvent,
+ "PME spline/spread",
+ kernelArgs);
+ }
+