pmeGpu->settings.transferKind, nullptr);
}
-void pme_gpu_realloc_coordinates(const PmeGpu* pmeGpu)
-{
- const size_t newCoordinatesSize = pmeGpu->nAtomsAlloc * DIM;
- GMX_ASSERT(newCoordinatesSize > 0, "Bad number of atoms in PME GPU");
- reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coordinates, newCoordinatesSize,
- &pmeGpu->archSpecific->coordinatesSize,
- &pmeGpu->archSpecific->coordinatesSizeAlloc, pmeGpu->archSpecific->context);
- if (c_usePadding)
- {
- const size_t paddingIndex = DIM * pmeGpu->kernelParams->atoms.nAtoms;
- const size_t paddingCount = DIM * pmeGpu->nAtomsAlloc - paddingIndex;
- if (paddingCount > 0)
- {
- clearDeviceBufferAsync(&pmeGpu->kernelParams->atoms.d_coordinates, paddingIndex,
- paddingCount, pmeGpu->archSpecific->pmeStream);
- }
- }
-}
-
-void pme_gpu_free_coordinates(const PmeGpu* pmeGpu)
-{
- freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coordinates);
-}
-
-void pme_gpu_realloc_and_copy_input_coefficients(const PmeGpu* pmeGpu, const float* h_coefficients)
+void pme_gpu_realloc_and_copy_input_coefficients(PmeGpu* pmeGpu, const float* h_coefficients)
{
GMX_ASSERT(h_coefficients, "Bad host-side charge buffer in PME GPU");
const size_t newCoefficientsSize = pmeGpu->nAtomsAlloc;
#elif GMX_GPU == GMX_GPU_OPENCL
cl_command_queue_properties queueProperties =
pmeGpu->archSpecific->useTiming ? CL_QUEUE_PROFILING_ENABLE : 0;
- cl_device_id device_id = pmeGpu->deviceInfo->ocl_gpu_id.ocl_device_id;
+ cl_device_id device_id = pmeGpu->deviceInfo->oclDeviceId;
cl_int clError;
pmeGpu->archSpecific->pmeStream =
clCreateCommandQueue(pmeGpu->archSpecific->context, device_id, queueProperties, &clError);
void pme_gpu_reinit_3dfft(const PmeGpu* pmeGpu)
{
- if (pme_gpu_performs_FFT(pmeGpu))
+ if (pme_gpu_settings(pmeGpu).performGPUFFT)
{
pmeGpu->archSpecific->fftSetup.resize(0);
for (int i = 0; i < pmeGpu->common->ngrids; i++)
// on the else branch
if (haveComputedEnergyAndVirial)
{
- if (pme_gpu_performs_solve(pmeGpu))
+ if (pme_gpu_settings(pmeGpu).performGPUSolve)
{
pme_gpu_getEnergyAndVirial(pme, &output);
}
kernelParamsPtr->grid.complexGridSizePadded[i] = kernelParamsPtr->grid.realGridSize[i];
}
/* FFT: n real elements correspond to (n / 2 + 1) complex elements in minor dimension */
- if (!pme_gpu_performs_FFT(pmeGpu))
+ if (!pme_gpu_settings(pmeGpu).performGPUFFT)
{
// This allows the GPU spreading grid and the CPU fftgrid to have the same layout, so that the data can be copied directly
kernelParamsPtr->grid.realGridSizePadded[ZZ] =
* TODO: this should become PmeGpu::PmeGpu()
*
* \param[in,out] pme The PME structure.
- * \param[in,out] gpuInfo The GPU information structure.
+ * \param[in,out] deviceInfo The GPU device information structure.
* \param[in] pmeGpuProgram The handle to the program/kernel data created outside (e.g. in unit tests/runner)
*/
-static void pme_gpu_init(gmx_pme_t* pme, const gmx_device_info_t* gpuInfo, PmeGpuProgramHandle pmeGpuProgram)
+static void pme_gpu_init(gmx_pme_t* pme, const DeviceInformation* deviceInfo, const PmeGpuProgram* pmeGpuProgram)
{
pme->gpu = new PmeGpu();
PmeGpu* pmeGpu = pme->gpu;
/* These settings are set here for the whole run; dynamic ones are set in pme_gpu_reinit() */
/* A convenience variable. */
- pmeGpu->settings.useDecomposition = (pme->nnodes == 1);
+ pmeGpu->settings.useDecomposition = (pme->nnodes != 1);
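+ // Note: useDecomposition is now true when PME is decomposed over more than one rank.
+ // The previous (pme->nnodes == 1) sense was inverted; it only worked because the
+ // pme_gpu_uses_dd() helper negated it again (cf. the pme_gpu_reinit() change below).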
/* TODO: CPU gather with GPU spread is broken due to different theta/dtheta layout. */
pmeGpu->settings.performGPUGather = true;
// By default GPU-side reduction is off (explicitly set here for tests, otherwise reset per-step)
pme_gpu_set_testing(pmeGpu, false);
- pmeGpu->deviceInfo = gpuInfo;
+ pmeGpu->deviceInfo = deviceInfo;
GMX_ASSERT(pmeGpuProgram != nullptr, "GPU kernels must be already compiled");
pmeGpu->programHandle_ = pmeGpuProgram;
}
}
-void pme_gpu_reinit(gmx_pme_t* pme, const gmx_device_info_t* gpuInfo, PmeGpuProgramHandle pmeGpuProgram)
+void pme_gpu_reinit(gmx_pme_t* pme, const DeviceInformation* deviceInfo, const PmeGpuProgram* pmeGpuProgram)
{
- if (!pme_gpu_active(pme))
+ GMX_ASSERT(pme != nullptr, "Need valid PME object");
+ if (pme->runMode == PmeRunMode::CPU)
{
+ GMX_ASSERT(pme->gpu == nullptr, "Should not have PME GPU object");
return;
}
if (!pme->gpu)
{
/* First-time initialization */
- pme_gpu_init(pme, gpuInfo, pmeGpuProgram);
+ pme_gpu_init(pme, deviceInfo, pmeGpuProgram);
}
else
{
}
/* GPU FFT is only used when PME runs on a single rank. */
pme->gpu->settings.performGPUFFT =
- (pme->gpu->common->runMode == PmeRunMode::GPU) && !pme_gpu_uses_dd(pme->gpu);
+ (pme->gpu->common->runMode == PmeRunMode::GPU) && !pme->gpu->settings.useDecomposition;
pme->gpu->settings.performGPUSolve = (pme->gpu->common->runMode == PmeRunMode::GPU);
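+ // Consequently, in PmeRunMode::Mixed the FFT and solve stages fall back to the CPU,
+ // and the FFT also falls back whenever PME is decomposed across multiple ranks.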
/* Reinit active timers */
pme_gpu_select_best_performing_pme_spreadgather_kernels(pmeGpu);
}
+/*! \internal \brief
+ * Returns the raw timing event from the corresponding GpuRegionTimer (if timings are enabled).
+ * With CUDA, the result can be a nullptr stub, per the GpuRegionTimer implementation.
+ *
+ * \param[in] pmeGpu     The PME GPU data structure.
+ * \param[in] PMEStageId The PME GPU stage gtPME_ index from the enum in src/gromacs/timing/gpu_timing.h.
+ */
+static CommandEvent* pme_gpu_fetch_timing_event(const PmeGpu* pmeGpu, size_t PMEStageId)
+{
+ CommandEvent* timingEvent = nullptr;
+ if (pme_gpu_timings_enabled(pmeGpu))
+ {
+ GMX_ASSERT(PMEStageId < pmeGpu->archSpecific->timingEvents.size(),
+ "Wrong PME GPU timing event index");
+ timingEvent = pmeGpu->archSpecific->timingEvents[PMEStageId].fetchNextEvent();
+ }
+ return timingEvent;
+}
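+// Illustrative call-site sketch (mirroring the spread/gather launches in this file):
+// the fetched event is handed to launchGpuKernel() so the region timer can record it.
+//     pme_gpu_start_timing(pmeGpu, timingId);
+//     CommandEvent* timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timingId);
+//     launchGpuKernel(kernelPtr, config, timingEvent, "PME spline/spread", kernelArgs);
+//     pme_gpu_stop_timing(pmeGpu, timingId);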
+
void pme_gpu_3dfft(const PmeGpu* pmeGpu, gmx_fft_direction dir, int grid_index)
{
int timerId = (dir == GMX_FFT_REAL_TO_COMPLEX) ? gtPME_FFT_R2C : gtPME_FFT_C2R;
// Only needed with CUDA on PP+PME ranks, not on separate PME ranks, in unit tests,
// or with OpenCL, as those cases use a single stream (hence xReadyOnDevice == nullptr).
GMX_ASSERT(xReadyOnDevice != nullptr || (GMX_GPU != GMX_GPU_CUDA)
- || pmeGpu->common->isRankPmeOnly || pme_gpu_is_testing(pmeGpu),
+ || pmeGpu->common->isRankPmeOnly || pme_gpu_settings(pmeGpu).copyAllOutputs,
"Need a valid coordinate synchronizer on PP+PME ranks with CUDA.");
if (xReadyOnDevice)
{
launchGpuKernel(kernelPtr, config, timingEvent, "PME spline/spread", kernelArgs);
pme_gpu_stop_timing(pmeGpu, timingId);
- const bool copyBackGrid =
- spreadCharges && (pme_gpu_is_testing(pmeGpu) || !pme_gpu_performs_FFT(pmeGpu));
+ const auto& settings = pmeGpu->settings;
+ const bool copyBackGrid = spreadCharges && (!settings.performGPUFFT || settings.copyAllOutputs);
if (copyBackGrid)
{
pme_gpu_copy_output_spread_grid(pmeGpu, h_grid);
}
const bool copyBackAtomData =
- computeSplines && (pme_gpu_is_testing(pmeGpu) || !pme_gpu_performs_gather(pmeGpu));
+ computeSplines && (!settings.performGPUGather || settings.copyAllOutputs);
if (copyBackAtomData)
{
pme_gpu_copy_output_spread_atom_data(pmeGpu);
void pme_gpu_solve(const PmeGpu* pmeGpu, t_complex* h_grid, GridOrdering gridOrdering, bool computeEnergyAndVirial)
{
- const bool copyInputAndOutputGrid = pme_gpu_is_testing(pmeGpu) || !pme_gpu_performs_FFT(pmeGpu);
+ const auto& settings = pmeGpu->settings;
+ const bool copyInputAndOutputGrid = !settings.performGPUFFT || settings.copyAllOutputs;
auto* kernelParamsPtr = pmeGpu->kernelParams.get();
* \param[in] pmeGpu The PME GPU structure.
* \param[in] useOrderThreadsPerAtom bool controlling if we should use order or order*order threads per atom
* \param[in] readSplinesFromGlobal bool controlling if we should read spline data from global memory
- * \param[in] forceTreatment Controls if the forces from the gather should increment or replace the input forces.
*
* \return Pointer to the selected GPU kernel
*/
-inline auto selectGatherKernelPtr(const PmeGpu* pmeGpu,
- bool useOrderThreadsPerAtom,
- bool readSplinesFromGlobal,
- PmeForceOutputHandling forceTreatment)
+inline auto selectGatherKernelPtr(const PmeGpu* pmeGpu, bool useOrderThreadsPerAtom, bool readSplinesFromGlobal)
{
PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
{
if (useOrderThreadsPerAtom)
{
- kernelPtr = (forceTreatment == PmeForceOutputHandling::Set)
- ? pmeGpu->programHandle_->impl_->gatherKernelReadSplinesThPerAtom4
- : pmeGpu->programHandle_->impl_->gatherReduceWithInputKernelReadSplinesThPerAtom4;
+ kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelReadSplinesThPerAtom4;
}
else
{
- kernelPtr = (forceTreatment == PmeForceOutputHandling::Set)
- ? pmeGpu->programHandle_->impl_->gatherKernelReadSplines
- : pmeGpu->programHandle_->impl_->gatherReduceWithInputKernelReadSplines;
+ kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelReadSplines;
}
}
else
{
if (useOrderThreadsPerAtom)
{
- kernelPtr = (forceTreatment == PmeForceOutputHandling::Set)
- ? pmeGpu->programHandle_->impl_->gatherKernelThPerAtom4
- : pmeGpu->programHandle_->impl_->gatherReduceWithInputKernelThPerAtom4;
+ kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelThPerAtom4;
}
else
{
- kernelPtr = (forceTreatment == PmeForceOutputHandling::Set)
- ? pmeGpu->programHandle_->impl_->gatherKernel
- : pmeGpu->programHandle_->impl_->gatherReduceWithInputKernel;
+ kernelPtr = pmeGpu->programHandle_->impl_->gatherKernel;
}
}
return kernelPtr;
}
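+// With the force-accumulating (gatherReduceWithInput*) variants gone, the selection
+// above is a 2x2 choice over (useOrderThreadsPerAtom, readSplinesFromGlobal):
+// four gather kernels instead of the former eight.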
-void pme_gpu_gather(PmeGpu* pmeGpu, PmeForceOutputHandling forceTreatment, const float* h_grid)
+void pme_gpu_gather(PmeGpu* pmeGpu, const float* h_grid)
{
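+ // The PmeForceOutputHandling parameter is gone: gather now always overwrites the force
+ // output, and accumulation with other force contributions is expected to happen in a
+ // separate reduction step outside of PME (cf. the GPU-side reduction setting in
+ // pme_gpu_init() above).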
- /* Copying the input CPU forces for reduction */
- if (forceTreatment != PmeForceOutputHandling::Set)
- {
- pme_gpu_copy_input_forces(pmeGpu);
- }
-
- if (!pme_gpu_performs_FFT(pmeGpu) || pme_gpu_is_testing(pmeGpu))
+ const auto& settings = pmeGpu->settings;
+ if (!settings.performGPUFFT || settings.copyAllOutputs)
{
pme_gpu_copy_input_gather_grid(pmeGpu, const_cast<float*>(h_grid));
}
- if (pme_gpu_is_testing(pmeGpu))
+ if (settings.copyAllOutputs)
{
pme_gpu_copy_input_gather_atom_data(pmeGpu);
}
// TODO test different cache configs
- int timingId = gtPME_GATHER;
- PmeGpuProgramImpl::PmeKernelHandle kernelPtr = selectGatherKernelPtr(
- pmeGpu, useOrderThreadsPerAtom, readGlobal || (!recalculateSplines), forceTreatment);
+ int timingId = gtPME_GATHER;
+ PmeGpuProgramImpl::PmeKernelHandle kernelPtr =
+ selectGatherKernelPtr(pmeGpu, useOrderThreadsPerAtom, readGlobal || (!recalculateSplines));
// TODO design kernel selection getters and make PmeGpu a friend of PmeGpuProgramImpl
pme_gpu_start_timing(pmeGpu, timingId);
}
}
-DeviceBuffer<float> pme_gpu_get_kernelparam_coordinates(const PmeGpu* pmeGpu)
-{
- GMX_ASSERT(pmeGpu && pmeGpu->kernelParams,
- "PME GPU device buffer was requested in non-GPU build or before the GPU PME was "
- "initialized.");
-
- return pmeGpu->kernelParams->atoms.d_coordinates;
-}
-
void* pme_gpu_get_kernelparam_forces(const PmeGpu* pmeGpu)
{
if (pmeGpu && pmeGpu->kernelParams)
}
}
-/*! \brief Check the validity of the device buffer.
- *
- * Checks if the buffer is not nullptr and, when possible, if it is big enough.
- *
- * \todo Split and move this function to gpu_utils.
- *
- * \param[in] buffer Device buffer to be checked.
- * \param[in] requiredSize Number of elements that the buffer will have to accommodate.
- *
- * \returns If the device buffer can be set.
- */
-template<typename T>
-static bool checkDeviceBuffer(gmx_unused DeviceBuffer<T> buffer, gmx_unused int requiredSize)
-{
-#if GMX_GPU == GMX_GPU_CUDA
- GMX_ASSERT(buffer != nullptr, "The device pointer is nullptr");
- return buffer != nullptr;
-#elif GMX_GPU == GMX_GPU_OPENCL
- size_t size;
- int retval = clGetMemObjectInfo(buffer, CL_MEM_SIZE, sizeof(size), &size, nullptr);
- GMX_ASSERT(retval == CL_SUCCESS,
- gmx::formatString("clGetMemObjectInfo failed with error code #%d", retval).c_str());
- GMX_ASSERT(static_cast<int>(size) >= requiredSize,
- "Number of atoms in device buffer is smaller then required size.");
- return retval == CL_SUCCESS && static_cast<int>(size) >= requiredSize;
-#elif GMX_GPU == GMX_GPU_NONE
- GMX_ASSERT(false, "Setter for device-side coordinates was called in non-GPU build.");
- return false;
-#endif
-}
-
-void pme_gpu_set_kernelparam_coordinates(const PmeGpu* pmeGpu, DeviceBuffer<float> d_x)
+void pme_gpu_set_kernelparam_coordinates(const PmeGpu* pmeGpu, DeviceBuffer<gmx::RVec> d_x)
{
GMX_ASSERT(pmeGpu && pmeGpu->kernelParams,
"PME GPU device buffer can not be set in non-GPU builds or before the GPU PME was "