Clean up ewald module internals

[alexxy/gromacs.git] / src / gromacs / ewald / pme_gpu_internal.cpp
diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp

index a7d6e1e96309ff60e7ab1ce1c4381fa78660214f..39973556be36e960afd6b7bced387cada1b24ca7 100644 (file)
--- a/src/gromacs/ewald/pme_gpu_internal.cpp
+++ b/src/gromacs/ewald/pme_gpu_internal.cpp
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2016,2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -231,7 +231,7 @@ void pme_gpu_copy_output_forces(PmeGpu* pmeGpu)
                           pmeGpu->settings.transferKind, nullptr);
  }
  
-void pme_gpu_realloc_coordinates(const PmeGpu* pmeGpu)
+void pme_gpu_realloc_coordinates(PmeGpu* pmeGpu)
  {
      const size_t newCoordinatesSize = pmeGpu->nAtomsAlloc * DIM;
      GMX_ASSERT(newCoordinatesSize > 0, "Bad number of atoms in PME GPU");
@@ -255,7 +255,7 @@ void pme_gpu_free_coordinates(const PmeGpu* pmeGpu)
      freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coordinates);
  }
  
-void pme_gpu_realloc_and_copy_input_coefficients(const PmeGpu* pmeGpu, const float* h_coefficients)
+void pme_gpu_realloc_and_copy_input_coefficients(PmeGpu* pmeGpu, const float* h_coefficients)
  {
      GMX_ASSERT(h_coefficients, "Bad host-side charge buffer in PME GPU");
      const size_t newCoefficientsSize = pmeGpu->nAtomsAlloc;
@@ -587,7 +587,7 @@ void pme_gpu_destroy_specific(const PmeGpu* pmeGpu)
  
  void pme_gpu_reinit_3dfft(const PmeGpu* pmeGpu)
  {
-    if (pme_gpu_performs_FFT(pmeGpu))
+    if (pme_gpu_settings(pmeGpu).performGPUFFT)
      {
          pmeGpu->archSpecific->fftSetup.resize(0);
          for (int i = 0; i < pmeGpu->common->ngrids; i++)
@@ -694,7 +694,7 @@ PmeOutput pme_gpu_getOutput(const gmx_pme_t& pme, const int flags)
      // on the else branch
      if (haveComputedEnergyAndVirial)
      {
-        if (pme_gpu_performs_solve(pmeGpu))
+        if (pme_gpu_settings(pmeGpu).performGPUSolve)
          {
              pme_gpu_getEnergyAndVirial(pme, &output);
          }
@@ -755,7 +755,7 @@ static void pme_gpu_reinit_grids(PmeGpu* pmeGpu)
          kernelParamsPtr->grid.complexGridSizePadded[i] = kernelParamsPtr->grid.realGridSize[i];
      }
      /* FFT: n real elements correspond to (n / 2 + 1) complex elements in minor dimension */
-    if (!pme_gpu_performs_FFT(pmeGpu))
+    if (!pme_gpu_settings(pmeGpu).performGPUFFT)
      {
          // This allows for GPU spreading grid and CPU fftgrid to have the same layout, so that we can copy the data directly
          kernelParamsPtr->grid.realGridSizePadded[ZZ] =
@@ -846,7 +846,7 @@ static void pme_gpu_select_best_performing_pme_spreadgather_kernels(PmeGpu* pmeG
   * \param[in,out] gpuInfo        The GPU information structure.
   * \param[in]     pmeGpuProgram  The handle to the program/kernel data created outside (e.g. in unit tests/runner)
   */
-static void pme_gpu_init(gmx_pme_t* pme, const gmx_device_info_t* gpuInfo, PmeGpuProgramHandle pmeGpuProgram)
+static void pme_gpu_init(gmx_pme_t* pme, const gmx_device_info_t* gpuInfo, const PmeGpuProgram* pmeGpuProgram)
  {
      pme->gpu       = new PmeGpu();
      PmeGpu* pmeGpu = pme->gpu;
@@ -855,7 +855,7 @@ static void pme_gpu_init(gmx_pme_t* pme, const gmx_device_info_t* gpuInfo, PmeGp
  
      /* These settings are set here for the whole run; dynamic ones are set in pme_gpu_reinit() */
      /* A convenience variable. */
-    pmeGpu->settings.useDecomposition = (pme->nnodes == 1);
+    pmeGpu->settings.useDecomposition = (pme->nnodes != 1);
      /* TODO: CPU gather with GPU spread is broken due to different theta/dtheta layout. */
      pmeGpu->settings.performGPUGather = true;
      // By default GPU-side reduction is off (explicitly set here for tests, otherwise reset per-step)
@@ -954,10 +954,12 @@ void pme_gpu_get_real_grid_sizes(const PmeGpu* pmeGpu, gmx::IVec* gridSize, gmx:
      }
  }
  
-void pme_gpu_reinit(gmx_pme_t* pme, const gmx_device_info_t* gpuInfo, PmeGpuProgramHandle pmeGpuProgram)
+void pme_gpu_reinit(gmx_pme_t* pme, const gmx_device_info_t* gpuInfo, const PmeGpuProgram* pmeGpuProgram)
  {
-    if (!pme_gpu_active(pme))
+    GMX_ASSERT(pme != nullptr, "Need valid PME object");
+    if (pme->runMode == PmeRunMode::CPU)
      {
+        GMX_ASSERT(pme->gpu == nullptr, "Should not have PME GPU object");
          return;
      }
  
@@ -973,7 +975,7 @@ void pme_gpu_reinit(gmx_pme_t* pme, const gmx_device_info_t* gpuInfo, PmeGpuProg
      }
      /* GPU FFT will only get used for a single rank.*/
      pme->gpu->settings.performGPUFFT =
-            (pme->gpu->common->runMode == PmeRunMode::GPU) && !pme_gpu_uses_dd(pme->gpu);
+            (pme->gpu->common->runMode == PmeRunMode::GPU) && !pme->gpu->settings.useDecomposition;
      pme->gpu->settings.performGPUSolve = (pme->gpu->common->runMode == PmeRunMode::GPU);
  
      /* Reinit active timers */
@@ -1037,6 +1039,25 @@ void pme_gpu_reinit_atoms(PmeGpu* pmeGpu, const int nAtoms, const real* charges)
      }
  }
  
+/*! \internal \brief
+ * Returns the raw timing event from the corresponding GpuRegionTimer (nullptr if timings are off).
+ * With CUDA the result can be a nullptr stub, per the GpuRegionTimer implementation.
+ *
+ * \param[in] pmeGpu         The PME GPU data structure.
+ * \param[in] PMEStageId     The PME GPU stage gtPME_ index from the enum in src/gromacs/timing/gpu_timing.h
+ */
+static CommandEvent* pme_gpu_fetch_timing_event(const PmeGpu* pmeGpu, size_t PMEStageId)
+{
+    CommandEvent* timingEvent = nullptr; // returned unchanged when timings are disabled
+    if (pme_gpu_timings_enabled(pmeGpu))
+    {
+        GMX_ASSERT(PMEStageId < pmeGpu->archSpecific->timingEvents.size(),
+                   "Wrong PME GPU timing event index");
+        timingEvent = pmeGpu->archSpecific->timingEvents[PMEStageId].fetchNextEvent();
+    }
+    return timingEvent;
+}
+
+
  void pme_gpu_3dfft(const PmeGpu* pmeGpu, gmx_fft_direction dir, int grid_index)
  {
      int timerId = (dir == GMX_FFT_REAL_TO_COMPLEX) ? gtPME_FFT_R2C : gtPME_FFT_C2R;
@@ -1206,7 +1227,7 @@ void pme_gpu_spread(const PmeGpu*         pmeGpu,
      // only needed with CUDA on PP+PME ranks, not on separate PME ranks, in unit tests
      // nor in OpenCL as these cases use a single stream (hence xReadyOnDevice == nullptr).
      GMX_ASSERT(xReadyOnDevice != nullptr || (GMX_GPU != GMX_GPU_CUDA)
-                       || pmeGpu->common->isRankPmeOnly || pme_gpu_is_testing(pmeGpu),
+                       || pmeGpu->common->isRankPmeOnly || pme_gpu_settings(pmeGpu).copyAllOutputs,
                 "Need a valid coordinate synchronizer on PP+PME ranks with CUDA.");
      if (xReadyOnDevice)
      {
@@ -1265,14 +1286,14 @@ void pme_gpu_spread(const PmeGpu*         pmeGpu,
      launchGpuKernel(kernelPtr, config, timingEvent, "PME spline/spread", kernelArgs);
      pme_gpu_stop_timing(pmeGpu, timingId);
  
-    const bool copyBackGrid =
-            spreadCharges && (pme_gpu_is_testing(pmeGpu) || !pme_gpu_performs_FFT(pmeGpu));
+    const auto& settings    = pmeGpu->settings;
+    const bool copyBackGrid = spreadCharges && (!settings.performGPUFFT || settings.copyAllOutputs);
      if (copyBackGrid)
      {
          pme_gpu_copy_output_spread_grid(pmeGpu, h_grid);
      }
      const bool copyBackAtomData =
-            computeSplines && (pme_gpu_is_testing(pmeGpu) || !pme_gpu_performs_gather(pmeGpu));
+            computeSplines && (!settings.performGPUGather || settings.copyAllOutputs);
      if (copyBackAtomData)
      {
          pme_gpu_copy_output_spread_atom_data(pmeGpu);
@@ -1281,7 +1302,8 @@ void pme_gpu_spread(const PmeGpu*         pmeGpu,
  
  void pme_gpu_solve(const PmeGpu* pmeGpu, t_complex* h_grid, GridOrdering gridOrdering, bool computeEnergyAndVirial)
  {
-    const bool copyInputAndOutputGrid = pme_gpu_is_testing(pmeGpu) || !pme_gpu_performs_FFT(pmeGpu);
+    const auto& settings               = pmeGpu->settings;
+    const bool  copyInputAndOutputGrid = !settings.performGPUFFT || settings.copyAllOutputs;
  
      auto* kernelParamsPtr = pmeGpu->kernelParams.get();
  
@@ -1441,12 +1463,13 @@ void pme_gpu_gather(PmeGpu* pmeGpu, PmeForceOutputHandling forceTreatment, const
          pme_gpu_copy_input_forces(pmeGpu);
      }
  
-    if (!pme_gpu_performs_FFT(pmeGpu) || pme_gpu_is_testing(pmeGpu))
+    const auto& settings = pmeGpu->settings;
+    if (!settings.performGPUFFT || settings.copyAllOutputs)
      {
          pme_gpu_copy_input_gather_grid(pmeGpu, const_cast<float*>(h_grid));
      }
  
-    if (pme_gpu_is_testing(pmeGpu))
+    if (settings.copyAllOutputs)
      {
          pme_gpu_copy_input_gather_atom_data(pmeGpu);
      }