Merge branch release-2020 into master
diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp
index e2c3059774913e524b85a85625a47245bfd991d8..255f273fde9101bfb890eed131281d61ce8b06f2 100644
--- a/src/gromacs/ewald/pme_gpu_internal.cpp
+++ b/src/gromacs/ewald/pme_gpu_internal.cpp
@@ -231,31 +231,7 @@ void pme_gpu_copy_output_forces(PmeGpu* pmeGpu)
                          pmeGpu->settings.transferKind, nullptr);
 }
 
-void pme_gpu_realloc_coordinates(const PmeGpu* pmeGpu)
-{
-    const size_t newCoordinatesSize = pmeGpu->nAtomsAlloc * DIM;
-    GMX_ASSERT(newCoordinatesSize > 0, "Bad number of atoms in PME GPU");
-    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coordinates, newCoordinatesSize,
-                           &pmeGpu->archSpecific->coordinatesSize,
-                           &pmeGpu->archSpecific->coordinatesSizeAlloc, pmeGpu->archSpecific->context);
-    if (c_usePadding)
-    {
-        const size_t paddingIndex = DIM * pmeGpu->kernelParams->atoms.nAtoms;
-        const size_t paddingCount = DIM * pmeGpu->nAtomsAlloc - paddingIndex;
-        if (paddingCount > 0)
-        {
-            clearDeviceBufferAsync(&pmeGpu->kernelParams->atoms.d_coordinates, paddingIndex,
-                                   paddingCount, pmeGpu->archSpecific->pmeStream);
-        }
-    }
-}
-
-void pme_gpu_free_coordinates(const PmeGpu* pmeGpu)
-{
-    freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coordinates);
-}
-
-void pme_gpu_realloc_and_copy_input_coefficients(const PmeGpu* pmeGpu, const float* h_coefficients)
+void pme_gpu_realloc_and_copy_input_coefficients(PmeGpu* pmeGpu, const float* h_coefficients)
 {
     GMX_ASSERT(h_coefficients, "Bad host-side charge buffer in PME GPU");
     const size_t newCoefficientsSize = pmeGpu->nAtomsAlloc;
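The two deletions above move device-side coordinate management out of PME: the coordinate buffer is now shared across GPU modules instead of being owned here. For reference, a minimal sketch of the removed realloc-and-pad pattern, reusing the helper names from the deleted lines; d_coordinates, currentSize, currentCapacity, stream, and context stand in for the kernelParams/archSpecific members:

    // Grow the device buffer to hold nAtomsAlloc padded atoms (DIM floats each).
    const size_t newCoordinatesSize = pmeGpu->nAtomsAlloc * DIM;
    reallocateDeviceBuffer(&d_coordinates, newCoordinatesSize, &currentSize, &currentCapacity, context);
    // Zero the padding tail so kernels that round the atom count up to a full
    // warp/wavefront read well-defined values past nAtoms.
    const size_t paddingIndex = DIM * nAtoms;
    const size_t paddingCount = newCoordinatesSize - paddingIndex;
    if (paddingCount > 0)
    {
        clearDeviceBufferAsync(&d_coordinates, paddingIndex, paddingCount, stream);
    }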
@@ -559,7 +535,7 @@ void pme_gpu_init_internal(PmeGpu* pmeGpu)
 #elif GMX_GPU == GMX_GPU_OPENCL
     cl_command_queue_properties queueProperties =
             pmeGpu->archSpecific->useTiming ? CL_QUEUE_PROFILING_ENABLE : 0;
-    cl_device_id device_id = pmeGpu->deviceInfo->ocl_gpu_id.ocl_device_id;
+    cl_device_id device_id = pmeGpu->deviceInfo->oclDeviceId;
     cl_int       clError;
     pmeGpu->archSpecific->pmeStream =
             clCreateCommandQueue(pmeGpu->archSpecific->context, device_id, queueProperties, &clError);
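The hunk stops right after the queue is created; the usual follow-up is an error check on clError. A hedged sketch (GMX_THROW, gmx::InternalError, and gmx::formatString are standard GROMACS utilities; the message text is illustrative):

    if (clError != CL_SUCCESS)
    {
        GMX_THROW(gmx::InternalError(
                gmx::formatString("Failed to create PME command queue (OpenCL error %d)", clError)));
    }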
@@ -587,7 +563,7 @@ void pme_gpu_destroy_specific(const PmeGpu* pmeGpu)
 
 void pme_gpu_reinit_3dfft(const PmeGpu* pmeGpu)
 {
-    if (pme_gpu_performs_FFT(pmeGpu))
+    if (pme_gpu_settings(pmeGpu).performGPUFFT)
     {
         pmeGpu->archSpecific->fftSetup.resize(0);
         for (int i = 0; i < pmeGpu->common->ngrids; i++)
@@ -694,7 +670,7 @@ PmeOutput pme_gpu_getOutput(const gmx_pme_t& pme, const int flags)
     // on the else branch
     if (haveComputedEnergyAndVirial)
     {
-        if (pme_gpu_performs_solve(pmeGpu))
+        if (pme_gpu_settings(pmeGpu).performGPUSolve)
         {
             pme_gpu_getEnergyAndVirial(pme, &output);
         }
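This and the previous hunk follow the same migration: per-predicate helpers (pme_gpu_performs_FFT, pme_gpu_performs_solve) are replaced by reads of a single settings struct. A sketch of the pattern, assuming pme_gpu_settings() returns a const reference to a struct carrying the boolean fields seen throughout this diff:

    const auto& settings = pme_gpu_settings(pmeGpu);
    if (settings.performGPUFFT)   { /* 3D FFT runs on the GPU */ }
    if (settings.performGPUSolve) { /* reciprocal-space solve runs on the GPU */ }
    if (settings.copyAllOutputs)  { /* testing mode: copy all intermediates back */ }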
@@ -755,7 +731,7 @@ static void pme_gpu_reinit_grids(PmeGpu* pmeGpu)
         kernelParamsPtr->grid.complexGridSizePadded[i] = kernelParamsPtr->grid.realGridSize[i];
     }
     /* FFT: n real elements correspond to (n / 2 + 1) complex elements in minor dimension */
-    if (!pme_gpu_performs_FFT(pmeGpu))
+    if (!pme_gpu_settings(pmeGpu).performGPUFFT)
     {
        // This allows the GPU spread grid and the CPU fftgrid to have the same layout, so that we can copy the data directly
         kernelParamsPtr->grid.realGridSizePadded[ZZ] =
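A worked instance of the r2c rule in the comment above, with an illustrative grid size: for n = 64 real elements in the minor (ZZ) dimension, the FFT produces n/2 + 1 = 33 complex elements, so matching the CPU fftgrid layout pads the real grid to 2 * 33 = 66 floats:

    const int nRealZ    = 64;              // illustrative minor-dimension grid size
    const int nComplexZ = nRealZ / 2 + 1;  // 33 complex elements after the r2c FFT
    const int nPaddedZ  = 2 * nComplexZ;   // 66 floats when sharing the CPU fftgrid layout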
@@ -843,10 +819,10 @@ static void pme_gpu_select_best_performing_pme_spreadgather_kernels(PmeGpu* pmeG
  * TODO: this should become PmeGpu::PmeGpu()
  *
  * \param[in,out] pme            The PME structure.
- * \param[in,out] gpuInfo        The GPU information structure.
+ * \param[in,out] deviceInfo     The GPU device information structure.
  * \param[in]     pmeGpuProgram  The handle to the program/kernel data created outside (e.g. in unit tests/runner)
  */
-static void pme_gpu_init(gmx_pme_t* pme, const gmx_device_info_t* gpuInfo, PmeGpuProgramHandle pmeGpuProgram)
+static void pme_gpu_init(gmx_pme_t* pme, const DeviceInformation* deviceInfo, const PmeGpuProgram* pmeGpuProgram)
 {
     pme->gpu       = new PmeGpu();
     PmeGpu* pmeGpu = pme->gpu;
@@ -855,7 +831,7 @@ static void pme_gpu_init(gmx_pme_t* pme, const gmx_device_info_t* gpuInfo, PmeGp
 
     /* These settings are set here for the whole run; dynamic ones are set in pme_gpu_reinit() */
     /* A convenience variable. */
-    pmeGpu->settings.useDecomposition = (pme->nnodes == 1);
+    pmeGpu->settings.useDecomposition = (pme->nnodes != 1);
     /* TODO: CPU gather with GPU spread is broken due to different theta/dtheta layout. */
     pmeGpu->settings.performGPUGather = true;
     // By default GPU-side reduction is off (explicitly set here for tests, otherwise reset per-step)
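The useDecomposition change above fixes an inverted predicate: a single rank (nnodes == 1) means no domain decomposition, so the flag must be the negation. Illustrative values:

    // pme->nnodes == 1  ->  useDecomposition == false  (single rank, no DD)
    // pme->nnodes == 4  ->  useDecomposition == true   (work split across ranks)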
@@ -863,7 +839,7 @@ static void pme_gpu_init(gmx_pme_t* pme, const gmx_device_info_t* gpuInfo, PmeGp
 
     pme_gpu_set_testing(pmeGpu, false);
 
-    pmeGpu->deviceInfo = gpuInfo;
+    pmeGpu->deviceInfo = deviceInfo;
     GMX_ASSERT(pmeGpuProgram != nullptr, "GPU kernels must be already compiled");
     pmeGpu->programHandle_ = pmeGpuProgram;
 
@@ -954,17 +930,19 @@ void pme_gpu_get_real_grid_sizes(const PmeGpu* pmeGpu, gmx::IVec* gridSize, gmx:
     }
 }
 
-void pme_gpu_reinit(gmx_pme_t* pme, const gmx_device_info_t* gpuInfo, PmeGpuProgramHandle pmeGpuProgram)
+void pme_gpu_reinit(gmx_pme_t* pme, const DeviceInformation* deviceInfo, const PmeGpuProgram* pmeGpuProgram)
 {
-    if (!pme_gpu_active(pme))
+    GMX_ASSERT(pme != nullptr, "Need valid PME object");
+    if (pme->runMode == PmeRunMode::CPU)
     {
+        GMX_ASSERT(pme->gpu == nullptr, "Should not have PME GPU object");
         return;
     }
 
     if (!pme->gpu)
     {
         /* First-time initialization */
-        pme_gpu_init(pme, gpuInfo, pmeGpuProgram);
+        pme_gpu_init(pme, deviceInfo, pmeGpuProgram);
     }
     else
     {
@@ -973,7 +951,7 @@ void pme_gpu_reinit(gmx_pme_t* pme, const gmx_device_info_t* gpuInfo, PmeGpuProg
     }
    /* GPU FFT will only get used for a single rank. */
     pme->gpu->settings.performGPUFFT =
-            (pme->gpu->common->runMode == PmeRunMode::GPU) && !pme_gpu_uses_dd(pme->gpu);
+            (pme->gpu->common->runMode == PmeRunMode::GPU) && !pme->gpu->settings.useDecomposition;
     pme->gpu->settings.performGPUSolve = (pme->gpu->common->runMode == PmeRunMode::GPU);
 
     /* Reinit active timers */
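Combined with the guard at the top of pme_gpu_reinit, the per-run flags follow the run mode. A summary of the outcomes (PmeRunMode values as used in this file):

    // runMode == PmeRunMode::CPU    ->  early return above; no PmeGpu object exists
    // runMode == PmeRunMode::Mixed  ->  performGPUFFT = false, performGPUSolve = false
    // runMode == PmeRunMode::GPU    ->  performGPUSolve = true,
    //                                   performGPUFFT  = !useDecomposition (single rank only)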
@@ -1037,6 +1015,25 @@ void pme_gpu_reinit_atoms(PmeGpu* pmeGpu, const int nAtoms, const real* charges)
     pme_gpu_select_best_performing_pme_spreadgather_kernels(pmeGpu);
 }
 
+/*! \internal \brief
+ * Returns the raw timing event from the corresponding GpuRegionTimer (if timings are enabled).
+ * In CUDA, the result can be a nullptr stub, per the GpuRegionTimer implementation.
+ *
+ * \param[in] pmeGpu         The PME GPU data structure.
+ * \param[in] PMEStageId     The PME GPU stage index (a gtPME_* value from the enum in src/gromacs/timing/gpu_timing.h).
+ */
+static CommandEvent* pme_gpu_fetch_timing_event(const PmeGpu* pmeGpu, size_t PMEStageId)
+{
+    CommandEvent* timingEvent = nullptr;
+    if (pme_gpu_timings_enabled(pmeGpu))
+    {
+        GMX_ASSERT(PMEStageId < pmeGpu->archSpecific->timingEvents.size(),
+                   "Wrong PME GPU timing event index");
+        timingEvent = pmeGpu->archSpecific->timingEvents[PMEStageId].fetchNextEvent();
+    }
+    return timingEvent;
+}
+
 void pme_gpu_3dfft(const PmeGpu* pmeGpu, gmx_fft_direction dir, int grid_index)
 {
     int timerId = (dir == GMX_FFT_REAL_TO_COMPLEX) ? gtPME_FFT_R2C : gtPME_FFT_C2R;
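The new helper is consumed around kernel launches elsewhere in this file. A sketch of the typical pattern, mirroring the gather hunk further down (kernelPtr, config, and kernelArgs are placeholders):

    int timerId = gtPME_GATHER;
    pme_gpu_start_timing(pmeGpu, timerId);
    CommandEvent* timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timerId); // nullptr when timing is off
    launchGpuKernel(kernelPtr, config, timingEvent, "PME gather", kernelArgs);
    pme_gpu_stop_timing(pmeGpu, timerId);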
@@ -1206,7 +1203,7 @@ void pme_gpu_spread(const PmeGpu*         pmeGpu,
    // only needed with CUDA on PP+PME ranks; not on separate PME ranks, in unit tests,
    // or in OpenCL, as those cases use a single stream (hence xReadyOnDevice == nullptr).
     GMX_ASSERT(xReadyOnDevice != nullptr || (GMX_GPU != GMX_GPU_CUDA)
-                       || pmeGpu->common->isRankPmeOnly || pme_gpu_is_testing(pmeGpu),
+                       || pmeGpu->common->isRankPmeOnly || pme_gpu_settings(pmeGpu).copyAllOutputs,
                "Need a valid coordinate synchronizer on PP+PME ranks with CUDA.");
     if (xReadyOnDevice)
     {
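The truncated branch performs the guarded wait that the assertion above documents; a hedged sketch, assuming xReadyOnDevice is a GpuEventSynchronizer* as elsewhere in GROMACS:

    if (xReadyOnDevice)
    {
        // Make the PME stream wait until the PP-side coordinate copy/update has completed.
        xReadyOnDevice->enqueueWaitEvent(pmeGpu->archSpecific->pmeStream);
    }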
@@ -1265,14 +1262,14 @@ void pme_gpu_spread(const PmeGpu*         pmeGpu,
     launchGpuKernel(kernelPtr, config, timingEvent, "PME spline/spread", kernelArgs);
     pme_gpu_stop_timing(pmeGpu, timingId);
 
-    const bool copyBackGrid =
-            spreadCharges && (pme_gpu_is_testing(pmeGpu) || !pme_gpu_performs_FFT(pmeGpu));
+    const auto& settings    = pmeGpu->settings;
+    const bool copyBackGrid = spreadCharges && (!settings.performGPUFFT || settings.copyAllOutputs);
     if (copyBackGrid)
     {
         pme_gpu_copy_output_spread_grid(pmeGpu, h_grid);
     }
     const bool copyBackAtomData =
-            computeSplines && (pme_gpu_is_testing(pmeGpu) || !pme_gpu_performs_gather(pmeGpu));
+            computeSplines && (!settings.performGPUGather || settings.copyAllOutputs);
     if (copyBackAtomData)
     {
         pme_gpu_copy_output_spread_atom_data(pmeGpu);
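The rewritten predicates in one place, plus the case that exercises them most often (mixed mode, where the FFT stays on the CPU):

    // copyBackGrid     = spreadCharges  && (!performGPUFFT    || copyAllOutputs)
    // copyBackAtomData = computeSplines && (!performGPUGather || copyAllOutputs)
    // e.g. mixed mode: performGPUFFT == false, so the spread grid is copied back for the CPU FFT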
@@ -1281,7 +1278,8 @@ void pme_gpu_spread(const PmeGpu*         pmeGpu,
 
 void pme_gpu_solve(const PmeGpu* pmeGpu, t_complex* h_grid, GridOrdering gridOrdering, bool computeEnergyAndVirial)
 {
-    const bool copyInputAndOutputGrid = pme_gpu_is_testing(pmeGpu) || !pme_gpu_performs_FFT(pmeGpu);
+    const auto& settings               = pmeGpu->settings;
+    const bool  copyInputAndOutputGrid = !settings.performGPUFFT || settings.copyAllOutputs;
 
     auto* kernelParamsPtr = pmeGpu->kernelParams.get();
 
@@ -1387,14 +1385,10 @@ void pme_gpu_solve(const PmeGpu* pmeGpu, t_complex* h_grid, GridOrdering gridOrd
  * \param[in]  pmeGpu                   The PME GPU structure.
  * \param[in]  useOrderThreadsPerAtom   bool controlling if we should use order or order*order threads per atom
 * \param[in]  readSplinesFromGlobal    bool controlling if we should read spline data from global memory
- * \param[in]  forceTreatment           Controls if the forces from the gather should increment or replace the input forces.
  *
 * \return Pointer to the selected gather kernel
  */
-inline auto selectGatherKernelPtr(const PmeGpu*          pmeGpu,
-                                  bool                   useOrderThreadsPerAtom,
-                                  bool                   readSplinesFromGlobal,
-                                  PmeForceOutputHandling forceTreatment)
+inline auto selectGatherKernelPtr(const PmeGpu* pmeGpu, bool useOrderThreadsPerAtom, bool readSplinesFromGlobal)
 
 {
     PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
@@ -1403,50 +1397,37 @@ inline auto selectGatherKernelPtr(const PmeGpu*          pmeGpu,
     {
         if (useOrderThreadsPerAtom)
         {
-            kernelPtr = (forceTreatment == PmeForceOutputHandling::Set)
-                                ? pmeGpu->programHandle_->impl_->gatherKernelReadSplinesThPerAtom4
-                                : pmeGpu->programHandle_->impl_->gatherReduceWithInputKernelReadSplinesThPerAtom4;
+            kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelReadSplinesThPerAtom4;
         }
         else
         {
-            kernelPtr = (forceTreatment == PmeForceOutputHandling::Set)
-                                ? pmeGpu->programHandle_->impl_->gatherKernelReadSplines
-                                : pmeGpu->programHandle_->impl_->gatherReduceWithInputKernelReadSplines;
+            kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelReadSplines;
         }
     }
     else
     {
         if (useOrderThreadsPerAtom)
         {
-            kernelPtr = (forceTreatment == PmeForceOutputHandling::Set)
-                                ? pmeGpu->programHandle_->impl_->gatherKernelThPerAtom4
-                                : pmeGpu->programHandle_->impl_->gatherReduceWithInputKernelThPerAtom4;
+            kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelThPerAtom4;
         }
         else
         {
-            kernelPtr = (forceTreatment == PmeForceOutputHandling::Set)
-                                ? pmeGpu->programHandle_->impl_->gatherKernel
-                                : pmeGpu->programHandle_->impl_->gatherReduceWithInputKernel;
+            kernelPtr = pmeGpu->programHandle_->impl_->gatherKernel;
         }
     }
     return kernelPtr;
 }
 
 
-void pme_gpu_gather(PmeGpu* pmeGpu, PmeForceOutputHandling forceTreatment, const float* h_grid)
+void pme_gpu_gather(PmeGpu* pmeGpu, const float* h_grid)
 {
-    /* Copying the input CPU forces for reduction */
-    if (forceTreatment != PmeForceOutputHandling::Set)
-    {
-        pme_gpu_copy_input_forces(pmeGpu);
-    }
-
-    if (!pme_gpu_performs_FFT(pmeGpu) || pme_gpu_is_testing(pmeGpu))
+    const auto& settings = pmeGpu->settings;
+    if (!settings.performGPUFFT || settings.copyAllOutputs)
     {
         pme_gpu_copy_input_gather_grid(pmeGpu, const_cast<float*>(h_grid));
     }
 
-    if (pme_gpu_is_testing(pmeGpu))
+    if (settings.copyAllOutputs)
     {
         pme_gpu_copy_input_gather_atom_data(pmeGpu);
     }
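With the forceTreatment dimension gone, selectGatherKernelPtr reduces to a 2x2 choice and callers drop the extra argument; any reduction with host-side input forces now has to happen outside this function. Summary of the remaining branches, plus a call-site sketch:

    //                                 useOrderThreadsPerAtom == true      useOrderThreadsPerAtom == false
    // readSplinesFromGlobal == true   gatherKernelReadSplinesThPerAtom4   gatherKernelReadSplines
    // readSplinesFromGlobal == false  gatherKernelThPerAtom4              gatherKernel
    pme_gpu_gather(pmeGpu, h_grid);  // was: pme_gpu_gather(pmeGpu, forceTreatment, h_grid)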
@@ -1482,9 +1463,9 @@ void pme_gpu_gather(PmeGpu* pmeGpu, PmeForceOutputHandling forceTreatment, const
 
     // TODO test different cache configs
 
-    int                                timingId  = gtPME_GATHER;
-    PmeGpuProgramImpl::PmeKernelHandle kernelPtr = selectGatherKernelPtr(
-            pmeGpu, useOrderThreadsPerAtom, readGlobal || (!recalculateSplines), forceTreatment);
+    int                                timingId = gtPME_GATHER;
+    PmeGpuProgramImpl::PmeKernelHandle kernelPtr =
+            selectGatherKernelPtr(pmeGpu, useOrderThreadsPerAtom, readGlobal || (!recalculateSplines));
     // TODO design kernel selection getters and make PmeGpu a friend of PmeGpuProgramImpl
 
     pme_gpu_start_timing(pmeGpu, timingId);
@@ -1512,15 +1493,6 @@ void pme_gpu_gather(PmeGpu* pmeGpu, PmeForceOutputHandling forceTreatment, const
     }
 }
 
-DeviceBuffer<float> pme_gpu_get_kernelparam_coordinates(const PmeGpu* pmeGpu)
-{
-    GMX_ASSERT(pmeGpu && pmeGpu->kernelParams,
-               "PME GPU device buffer was requested in non-GPU build or before the GPU PME was "
-               "initialized.");
-
-    return pmeGpu->kernelParams->atoms.d_coordinates;
-}
-
 void* pme_gpu_get_kernelparam_forces(const PmeGpu* pmeGpu)
 {
     if (pmeGpu && pmeGpu->kernelParams)
@@ -1533,38 +1505,7 @@ void* pme_gpu_get_kernelparam_forces(const PmeGpu* pmeGpu)
     }
 }
 
-/*! \brief Check the validity of the device buffer.
- *
- * Checks if the buffer is not nullptr and, when possible, if it is big enough.
- *
- * \todo Split and move this function to gpu_utils.
- *
- * \param[in] buffer        Device buffer to be checked.
- * \param[in] requiredSize  Number of elements that the buffer will have to accommodate.
- *
- * \returns If the device buffer can be set.
- */
-template<typename T>
-static bool checkDeviceBuffer(gmx_unused DeviceBuffer<T> buffer, gmx_unused int requiredSize)
-{
-#if GMX_GPU == GMX_GPU_CUDA
-    GMX_ASSERT(buffer != nullptr, "The device pointer is nullptr");
-    return buffer != nullptr;
-#elif GMX_GPU == GMX_GPU_OPENCL
-    size_t size;
-    int    retval = clGetMemObjectInfo(buffer, CL_MEM_SIZE, sizeof(size), &size, nullptr);
-    GMX_ASSERT(retval == CL_SUCCESS,
-               gmx::formatString("clGetMemObjectInfo failed with error code #%d", retval).c_str());
-    GMX_ASSERT(static_cast<int>(size) >= requiredSize,
-               "Number of atoms in device buffer is smaller then required size.");
-    return retval == CL_SUCCESS && static_cast<int>(size) >= requiredSize;
-#elif GMX_GPU == GMX_GPU_NONE
-    GMX_ASSERT(false, "Setter for device-side coordinates was called in non-GPU build.");
-    return false;
-#endif
-}
-
-void pme_gpu_set_kernelparam_coordinates(const PmeGpu* pmeGpu, DeviceBuffer<float> d_x)
+void pme_gpu_set_kernelparam_coordinates(const PmeGpu* pmeGpu, DeviceBuffer<gmx::RVec> d_x)
 {
     GMX_ASSERT(pmeGpu && pmeGpu->kernelParams,
                "PME GPU device buffer can not be set in non-GPU builds or before the GPU PME was "