PME reduction for CUDA F buffer operations
author    Alan Gray <alangray3@gmail.com>
Fri, 10 May 2019 10:28:58 +0000 (03:28 -0700)
committer Szilárd Páll <pall.szilard@gmail.com>
Thu, 15 Aug 2019 14:22:50 +0000 (16:22 +0200)
Enable with GMX_USE_GPU_BUFFER_OPS env variable.

Provides functionality to perform the reduction of PME forces in the F
buffer ops kernel. Currently active when a single GPU performs both PME
and PP (multi-GPU support will follow in a patch that performs PME/PP
communication directly between GPUs). When active, the device-to-host
copy of PME forces and the CPU-side reduction are disabled.

Implements part of #3029, refs #2817
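
As a schematic sketch of how the pieces below fit together when the new
path is active (names are those introduced in this patch; the flag
plumbing is condensed for illustration):

    // PME side: the gather kernel runs in the PME stream; instead of a
    // device-to-host force copy, an event is recorded in that stream.
    pme_gpu_gather(pme->gpu, forceTreatment, fftgrid, /*useGpuFPmeReduction=*/true);
    //   -> pmeForcesReady.markEvent(pmeStream);

    // PP side: the F buffer ops kernel first enqueues a wait on that event
    // in the nonbonded stream, then folds the PME forces into the
    // nbat -> rvec reduction on the device.
    nbv->atomdata_add_nbat_f_to_f(locality, f,
                                  pme_gpu_get_device_f(pmedata),
                                  pme_gpu_get_f_ready_synchronizer(pmedata),
                                  useGpuFBufOps, /*useGpuFPmeReduction=*/true,
                                  accumulateForce);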

Change-Id: I3e66b6919c1e86bf0bed42b74136f8694626910b

15 files changed:
src/gromacs/ewald/pme.h
src/gromacs/ewald/pme_gpu.cpp
src/gromacs/ewald/pme_gpu_internal.cpp
src/gromacs/ewald/pme_gpu_internal.h
src/gromacs/ewald/pme_gpu_types_host_impl.h
src/gromacs/ewald/pme_only.cpp
src/gromacs/ewald/tests/pmetestcommon.cpp
src/gromacs/mdlib/sim_util.cpp
src/gromacs/nbnxm/atomdata.cpp
src/gromacs/nbnxm/atomdata.h
src/gromacs/nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh
src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
src/gromacs/nbnxm/nbnxm.cpp
src/gromacs/nbnxm/nbnxm.h
src/gromacs/nbnxm/nbnxm_gpu.h

index 29cc106554ded389f1b7b796e40906157bbcd5fe..a6603901358c7a1e5d485e860ad39ca98fc29a4b 100644 (file)
@@ -73,6 +73,7 @@ struct NumPmeDomains;
 
 enum class GpuTaskCompletion;
 class PmeGpuProgram;
+class GpuEventSynchronizer;
 //! Convenience name.
 using PmeGpuProgramHandle = const PmeGpuProgram *;
 
@@ -382,10 +383,12 @@ GPU_FUNC_QUALIFIER void pme_gpu_launch_complex_transforms(gmx_pme_t       *GPU_F
  * \param[in]  forceTreatment    Tells how data should be treated. The gathering kernel either stores
  *                               the output reciprocal forces into the host array, or copies its contents to the GPU first
  *                               and accumulates. The reduction is non-atomic.
+ * \param[in]  useGpuFPmeReduction  Whether PME forces are reduced on GPU
  */
 GPU_FUNC_QUALIFIER void pme_gpu_launch_gather(const gmx_pme_t        *GPU_FUNC_ARGUMENT(pme),
                                               gmx_wallcycle          *GPU_FUNC_ARGUMENT(wcycle),
-                                              PmeForceOutputHandling  GPU_FUNC_ARGUMENT(forceTreatment)) GPU_FUNC_TERM;
+                                              PmeForceOutputHandling  GPU_FUNC_ARGUMENT(forceTreatment),
+                                              bool                    GPU_FUNC_ARGUMENT(useGpuFPmeReduction)) GPU_FUNC_TERM;
 
 /*! \brief
  * Attempts to complete PME GPU tasks.
@@ -426,13 +429,15 @@ GPU_FUNC_QUALIFIER bool
  * \param[in]  wcycle          The wallclock counter.
  * \param[out] forceWithVirial The output force and virial
  * \param[out] enerd           The output energies
+ * \param[in]  useGpuFPmeReduction  Whether PME forces are reduced on GPU
  */
 GPU_FUNC_QUALIFIER void
     pme_gpu_wait_and_reduce(gmx_pme_t            *GPU_FUNC_ARGUMENT(pme),
                             int                   GPU_FUNC_ARGUMENT(flags),
                             gmx_wallcycle        *GPU_FUNC_ARGUMENT(wcycle),
                             gmx::ForceWithVirial *GPU_FUNC_ARGUMENT(forceWithVirial),
-                            gmx_enerdata_t       *GPU_FUNC_ARGUMENT(enerd)) GPU_FUNC_TERM;
+                            gmx_enerdata_t       *GPU_FUNC_ARGUMENT(enerd),
+                            bool                  GPU_FUNC_ARGUMENT(useGpuFPmeReduction)) GPU_FUNC_TERM;
 
 /*! \brief
  * The PME GPU reinitialization function that is called both at the end of any PME computation and on any load balancing.
@@ -451,6 +456,22 @@ GPU_FUNC_QUALIFIER void pme_gpu_reinit_computation(const gmx_pme_t *GPU_FUNC_ARG
                                                    gmx_wallcycle   *GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM;
 
 
-/*! \brief Get pointer to device copy of coordinate data. */
+/*! \brief Get pointer to device copy of coordinate data.
+ * \param[in] pme            The PME data structure.
+ * \returns                  Pointer to coordinate data
+ */
 GPU_FUNC_QUALIFIER void *pme_gpu_get_device_x(const gmx_pme_t *GPU_FUNC_ARGUMENT(pme)) GPU_FUNC_TERM_WITH_RETURN(nullptr);
+
+/*! \brief Get pointer to device copy of force data.
+ * \param[in] pme            The PME data structure.
+ * \returns                  Pointer to force data
+ */
+GPU_FUNC_QUALIFIER void *pme_gpu_get_device_f(const gmx_pme_t *GPU_FUNC_ARGUMENT(pme)) GPU_FUNC_TERM_WITH_RETURN(nullptr);
+
+/*! \brief Get pointer to the device synchronizer object that allows syncing on PME force calculation completion.
+ * \param[in] pme            The PME data structure.
+ * \returns                  Pointer to synchronizer
+ */
+GPU_FUNC_QUALIFIER GpuEventSynchronizer *pme_gpu_get_f_ready_synchronizer(const gmx_pme_t *GPU_FUNC_ARGUMENT(pme)) GPU_FUNC_TERM_WITH_RETURN(nullptr);
+
 #endif
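
A minimal sketch of the intended call pattern for the two new getters
(mirroring their use in sim_util.cpp further down; both return nullptr
when PME is not running on a GPU, so the results can be passed through
unconditionally):

    void                 *pmeForcePtr = pme_gpu_get_device_f(fr->pmedata);
    GpuEventSynchronizer *pmeSync     = pme_gpu_get_f_ready_synchronizer(fr->pmedata);
    nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local, forceOut.f(),
                                  pmeForcePtr, pmeSync,
                                  useGpuFBufOps, useGpuFPmeReduction, accumulateForce);
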
index 07d15cdf1a55130b6eb9e53ec6992874503ab443..cd1ba956c428f55c8cbde3120a9d612b6357830d 100644 (file)
@@ -249,7 +249,8 @@ void pme_gpu_launch_complex_transforms(gmx_pme_t      *pme,
 
 void pme_gpu_launch_gather(const gmx_pme_t                 *pme,
                            gmx_wallcycle gmx_unused        *wcycle,
-                           PmeForceOutputHandling           forceTreatment)
+                           PmeForceOutputHandling           forceTreatment,
+                           bool                             useGpuFPmeReduction)
 {
     GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
 
@@ -262,7 +263,7 @@ void pme_gpu_launch_gather(const gmx_pme_t                 *pme,
     wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
     const unsigned int gridIndex  = 0;
     real              *fftgrid    = pme->fftgrid[gridIndex];
-    pme_gpu_gather(pme->gpu, forceTreatment, reinterpret_cast<float *>(fftgrid));
+    pme_gpu_gather(pme->gpu, forceTreatment, reinterpret_cast<float *>(fftgrid), useGpuFPmeReduction);
     wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
     wallcycle_stop(wcycle, ewcLAUNCH_GPU);
 }
@@ -286,7 +287,8 @@ static void pme_gpu_reduce_outputs(const int             flags,
                                    const PmeOutput      &output,
                                    gmx_wallcycle        *wcycle,
                                    gmx::ForceWithVirial *forceWithVirial,
-                                   gmx_enerdata_t       *enerd)
+                                   gmx_enerdata_t       *enerd,
+                                   bool                  useGpuFPmeReduction)
 {
     wallcycle_start(wcycle, ewcPME_GPU_F_REDUCTION);
     GMX_ASSERT(forceWithVirial, "Invalid force pointer");
@@ -297,7 +299,10 @@ static void pme_gpu_reduce_outputs(const int             flags,
         forceWithVirial->addVirialContribution(output.coulombVirial_);
         enerd->term[F_COUL_RECIP] += output.coulombEnergy_;
     }
-    sum_forces(forceWithVirial->force_, output.forces_);
+    if (!useGpuFPmeReduction)
+    {
+        sum_forces(forceWithVirial->force_, output.forces_);
+    }
     wallcycle_stop(wcycle, ewcPME_GPU_F_REDUCTION);
 }
 
@@ -344,7 +349,7 @@ bool pme_gpu_try_finish_task(gmx_pme_t            *pme,
     PmeOutput output = pme_gpu_getOutput(*pme, flags);
     wallcycle_stop(wcycle, ewcWAIT_GPU_PME_GATHER);
 
-    pme_gpu_reduce_outputs(flags, output, wcycle, forceWithVirial, enerd);
+    pme_gpu_reduce_outputs(flags, output, wcycle, forceWithVirial, enerd, false);
 
     return true;
 }
@@ -357,7 +362,9 @@ PmeOutput pme_gpu_wait_finish_task(gmx_pme_t     *pme,
     GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
 
     wallcycle_start(wcycle, ewcWAIT_GPU_PME_GATHER);
+
     // Synchronize the whole PME stream at once, including D2H result transfers.
+    // TODO: make this sync conditional with useGpuFPmeReduction to wait only for virial/energies
     pme_gpu_synchronize(pme->gpu);
 
     pme_gpu_update_timings(pme->gpu);
@@ -371,10 +378,11 @@ void pme_gpu_wait_and_reduce(gmx_pme_t            *pme,
                              const int             flags,
                              gmx_wallcycle        *wcycle,
                              gmx::ForceWithVirial *forceWithVirial,
-                             gmx_enerdata_t       *enerd)
+                             gmx_enerdata_t       *enerd,
+                             bool                  useGpuFPmeReduction)
 {
     PmeOutput output = pme_gpu_wait_finish_task(pme, flags, wcycle);
-    pme_gpu_reduce_outputs(flags, output, wcycle, forceWithVirial, enerd);
+    pme_gpu_reduce_outputs(flags, output, wcycle, forceWithVirial, enerd, useGpuFPmeReduction);
 }
 
 void pme_gpu_reinit_computation(const gmx_pme_t *pme,
@@ -400,3 +408,22 @@ void *pme_gpu_get_device_x(const gmx_pme_t *pme)
     }
     return pme_gpu_get_kernelparam_coordinates(pme->gpu);
 }
+
+void *pme_gpu_get_device_f(const gmx_pme_t *pme)
+{
+    if (!pme || !pme_gpu_active(pme))
+    {
+        return nullptr;
+    }
+    return pme_gpu_get_kernelparam_forces(pme->gpu);
+}
+
+GpuEventSynchronizer * pme_gpu_get_f_ready_synchronizer(const gmx_pme_t *pme)
+{
+    if (!pme || !pme_gpu_active(pme))
+    {
+        return nullptr;
+    }
+
+    return pme_gpu_get_forces_ready_synchronizer(pme->gpu);
+}
index cad028b557775811ad759173ffe303f5306f3a18..70c74c900ce146a35949cde6483407790e22b472 100644 (file)
@@ -239,7 +239,6 @@ void pme_gpu_copy_input_coordinates(const PmeGpu *pmeGpu, const rvec *h_coordina
     // FIXME: sync required since the copied data will be used by PP stream when using single GPU for both
     //        Remove after adding the required event-based sync between the above H2D and the transform kernel
     pme_gpu_synchronize(pmeGpu);
-
 #endif
 }
 
@@ -1207,7 +1206,8 @@ void pme_gpu_solve(const PmeGpu *pmeGpu, t_complex *h_grid,
 
 void pme_gpu_gather(PmeGpu                *pmeGpu,
                     PmeForceOutputHandling forceTreatment,
-                    const float           *h_grid
+                    const float           *h_grid,
+                    bool                   useGpuFPmeReduction
                     )
 {
     /* Copying the input CPU forces for reduction */
@@ -1269,7 +1269,14 @@ void pme_gpu_gather(PmeGpu                *pmeGpu,
     launchGpuKernel(kernelPtr, config, timingEvent, "PME gather", kernelArgs);
     pme_gpu_stop_timing(pmeGpu, timingId);
 
-    pme_gpu_copy_output_forces(pmeGpu);
+    if (useGpuFPmeReduction)
+    {
+        pmeGpu->archSpecific->pmeForcesReady.markEvent(pmeGpu->archSpecific->pmeStream);
+    }
+    else
+    {
+        pme_gpu_copy_output_forces(pmeGpu);
+    }
 }
 
 void * pme_gpu_get_kernelparam_coordinates(const PmeGpu *pmeGpu)
@@ -1282,5 +1289,28 @@ void * pme_gpu_get_kernelparam_coordinates(const PmeGpu *pmeGpu)
     {
         return nullptr;
     }
+}
 
+void * pme_gpu_get_kernelparam_forces(const PmeGpu *pmeGpu)
+{
+    if (pmeGpu && pmeGpu->kernelParams)
+    {
+        return pmeGpu->kernelParams->atoms.d_forces;
+    }
+    else
+    {
+        return nullptr;
+    }
+}
+
+GpuEventSynchronizer *pme_gpu_get_forces_ready_synchronizer(const PmeGpu *pmeGpu)
+{
+    if (pmeGpu && pmeGpu->kernelParams)
+    {
+        return &pmeGpu->archSpecific->pmeForcesReady;
+    }
+    else
+    {
+        return nullptr;
+    }
 }
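
The synchronization here is the usual produce/consume event pattern,
sketched below with the two GpuEventSynchronizer calls this patch uses
(the stream names are illustrative):

    // Producer, PME stream: record completion right after the gather
    // kernel, replacing the device-to-host force copy.
    pmeGpu->archSpecific->pmeForcesReady.markEvent(pmeGpu->archSpecific->pmeStream);

    // Consumer, nonbonded stream: the F buffer ops launch enqueues a wait
    // before its kernel reads d_forces; the host never blocks.
    pmeForcesReady->enqueueWaitEvent(nbStream);
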
index e22fe645928d5555b43087003e71c987920dd866..19d243af41ef6377d33304ab2b9463206ee04df6 100644 (file)
@@ -437,14 +437,31 @@ GPU_FUNC_QUALIFIER void pme_gpu_solve(const PmeGpu    *GPU_FUNC_ARGUMENT(pmeGpu)
  * \param[in]     forceTreatment   Tells how data in h_forces should be treated.
  *                                 TODO: determine efficiency/balance of host/device-side reductions.
  * \param[in]     h_grid           The host-side grid buffer (used only in testing mode)
+ * \param[in]     useGpuFPmeReduction Whether forces are reduced on GPU
  */
 GPU_FUNC_QUALIFIER void pme_gpu_gather(PmeGpu                *GPU_FUNC_ARGUMENT(pmeGpu),
                                        PmeForceOutputHandling GPU_FUNC_ARGUMENT(forceTreatment),
-                                       const float           *GPU_FUNC_ARGUMENT(h_grid)) GPU_FUNC_TERM;
+                                       const float           *GPU_FUNC_ARGUMENT(h_grid),
+                                       bool                   GPU_FUNC_ARGUMENT(useGpuFPmeReduction)) GPU_FUNC_TERM;
 
-/*! \brief Return pointer to device copy of coordinate data. */
+/*! \brief Return pointer to device copy of coordinate data.
+ * \param[in] pmeGpu         The PME GPU structure.
+ * \returns                  Pointer to coordinate data
+ */
 GPU_FUNC_QUALIFIER void * pme_gpu_get_kernelparam_coordinates(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM_WITH_RETURN(nullptr);
 
+/*! \brief Return pointer to device copy of force data.
+ * \param[in] pmeGpu         The PME GPU structure.
+ * \returns                  Pointer to force data
+ */
+GPU_FUNC_QUALIFIER void * pme_gpu_get_kernelparam_forces(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM_WITH_RETURN(nullptr);
+
+/*! \brief Return pointer to the sync object that is triggered when the PME force calculation completes.
+ * \param[in] pmeGpu         The PME GPU structure.
+ * \returns                  Pointer to sync object
+ */
+GPU_FUNC_QUALIFIER GpuEventSynchronizer *pme_gpu_get_forces_ready_synchronizer(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM_WITH_RETURN(nullptr);
+
 /* The inlined convenience PME GPU status getters */
 
 /*! \libinternal \brief
index 6949bde60f66deaeff8e889ab6b9299adb103fdc..be865c87787fed9609fdd37ebebc6f1fbd7c7aee 100644 (file)
@@ -79,6 +79,8 @@ struct PmeGpuSpecific
     Context context;
 
     /* Synchronization events */
+    /*! \brief Triggered after the PME force calculations have completed */
+    GpuEventSynchronizer pmeForcesReady;
     /*! \brief Triggered after the grid has been copied to the host (after the spreading stage). */
     GpuEventSynchronizer syncSpreadGridD2H;
 
index 6f8ebe1723e80eb7cca84ddbfeada3a23a7d9064..e3e948a856ac6edac000bf8e4958ac044f8529d2 100644 (file)
@@ -626,7 +626,7 @@ int gmx_pmeonly(struct gmx_pme_t *pme,
             pme_gpu_prepare_computation(pme, boxChanged, box, wcycle, pmeFlags);
             pme_gpu_launch_spread(pme, as_rvec_array(pme_pp->x.data()), wcycle);
             pme_gpu_launch_complex_transforms(pme, wcycle);
-            pme_gpu_launch_gather(pme, wcycle, PmeForceOutputHandling::Set);
+            pme_gpu_launch_gather(pme, wcycle, PmeForceOutputHandling::Set, false);
             output = pme_gpu_wait_finish_task(pme, pmeFlags, wcycle);
             pme_gpu_reinit_computation(pme, wcycle);
         }
index 55515294d2816d13629553745c8e4ff748fd7737..763d79d6823620828a8e6dda94fd89bc653fd343 100644 (file)
@@ -418,7 +418,7 @@ void pmePerformGather(gmx_pme_t *pme, CodePath mode,
             {
                 std::copy(std::begin(forces), std::end(forces), std::begin(output.forces_));
             }
-            pme_gpu_gather(pme->gpu, inputTreatment, reinterpret_cast<float *>(fftgrid));
+            pme_gpu_gather(pme->gpu, inputTreatment, reinterpret_cast<float *>(fftgrid), false);
             std::copy(std::begin(output.forces_), std::end(output.forces_), std::begin(forces));
         }
         break;
index 11963c0de663dfe3421d2310d734d26c1e3a3637..0c53b40c4dd3abed1ecc796997183b84d5265cab 100644 (file)
@@ -623,12 +623,14 @@ static inline void launchPmeGpuSpread(gmx_pme_t      *pmedata,
  *
  * \param[in]  pmedata        The PME structure
  * \param[in]  wcycle         The wallcycle structure
+ * \param[in]  useGpuFPmeReduction Whether forces will be reduced on GPU
  */
 static void launchPmeGpuFftAndGather(gmx_pme_t        *pmedata,
-                                     gmx_wallcycle_t   wcycle)
+                                     gmx_wallcycle_t   wcycle,
+                                     bool              useGpuFPmeReduction)
 {
     pme_gpu_launch_complex_transforms(pmedata, wcycle);
-    pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
+    pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set, useGpuFPmeReduction);
 }
 
 /*! \brief
@@ -695,9 +697,7 @@ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t                  *nbv
                 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
 
                 nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local,
-                                              as_rvec_array(force->unpaddedArrayRef().data()),
-                                              BufferOpsUseGpu::False,
-                                              GpuBufferOpsAccumulateForce::Null);
+                                              as_rvec_array(force->unpaddedArrayRef().data()));
             }
         }
     }
@@ -907,12 +907,18 @@ void do_force(FILE                                     *fplog,
         ((flags & GMX_FORCE_VIRIAL) ? GMX_PME_CALC_ENER_VIR : 0) |
         ((flags & GMX_FORCE_ENERGY) ? GMX_PME_CALC_ENER_VIR : 0) |
         ((flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0);
-    const BufferOpsUseGpu useGpuFBufOps = (c_enableGpuBufOps && bUseGPU && (GMX_GPU == GMX_GPU_CUDA))
-        && !(flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) ?
-        BufferOpsUseGpu::True : BufferOpsUseGpu::False;
 
+    // Switches to control whether position and force buffer operations run on the GPU
+    // TODO consider all possible combinations of triggers, and how to combine optimally in each case.
     const BufferOpsUseGpu useGpuXBufOps = (c_enableGpuBufOps && bUseGPU && (GMX_GPU == GMX_GPU_CUDA)) ?
         BufferOpsUseGpu::True : BufferOpsUseGpu::False;
+    // GPU force buffer ops are disabled on virial steps, because the virial calculation is not yet ported to the GPU
+    const BufferOpsUseGpu useGpuFBufOps = (c_enableGpuBufOps && bUseGPU && (GMX_GPU == GMX_GPU_CUDA))
+        && !(flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) ?
+        BufferOpsUseGpu::True : BufferOpsUseGpu::False;
+    // TODO: move / add this flag to the internal PME GPU data structures
+    const bool useGpuFPmeReduction = (useGpuFBufOps == BufferOpsUseGpu::True) &&
+        thisRankHasDuty(cr, DUTY_PME) && useGpuPme; // only supported if this rank performs PME on the GPU
 
     /* At a search step we need to start the first balancing region
      * somewhere early inside the step after communication during domain
@@ -1141,7 +1147,7 @@ void do_force(FILE                                     *fplog,
         // X copy/transform to allow overlap as well as after the GPU NB
         // launch to avoid FFT launch overhead hijacking the CPU and delaying
         // the nonbonded kernel.
-        launchPmeGpuFftAndGather(fr->pmedata, wcycle);
+        launchPmeGpuFftAndGather(fr->pmedata, wcycle, useGpuFPmeReduction);
     }
 
     /* Communicate coordinates and sum dipole if necessary +
@@ -1327,9 +1333,7 @@ void do_force(FILE                                     *fplog,
          * communication with calculation with domain decomposition.
          */
         wallcycle_stop(wcycle, ewcFORCE);
-        nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::All, forceOut.f(),
-                                      BufferOpsUseGpu::False,
-                                      GpuBufferOpsAccumulateForce::Null);
+        nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::All, forceOut.f());
 
         wallcycle_start_nocount(wcycle, ewcFORCE);
 
@@ -1365,20 +1369,8 @@ void do_force(FILE                                     *fplog,
                          flags, &forceOut.forceWithVirial(), enerd,
                          ed, bNS);
 
-    // flag to specify if CPU force output is preset in force
-    // buffer. For now, this is true even when useGpuPme == true
-    // (because on-GPU PME-nonbonded reduction will be added in
-    // follow-up)
-    // TODO adapt the below when on-GPU PME-nonbonded reduction is available.
-    bool                   useCpuPmeReduction = true;
-    bool                   haveCpuForces      = (ppForceWorkload->haveSpecialForces || ppForceWorkload->haveCpuListedForceWork || useCpuPmeReduction);
-    // flag to specify if forces should be accumulated in force buffer
-    // ops. For now, this is solely determined by above haveCpuForces
-    // flag, but in future developments it will also depend on
-    // e.g. whether the GPU force halo exchange is active.
-    GpuBufferOpsAccumulateForce accumulateForce = (useGpuFBufOps == BufferOpsUseGpu::True) &&
-        haveCpuForces ? GpuBufferOpsAccumulateForce::True :
-        GpuBufferOpsAccumulateForce::False;
+    bool                   useCpuFPmeReduction = thisRankHasDuty(cr, DUTY_PME) && !useGpuFPmeReduction;
+    bool                   haveCpuForces       = (ppForceWorkload->haveSpecialForces || ppForceWorkload->haveCpuListedForceWork || useCpuFPmeReduction);
 
     // Will store the amount of cycles spent waiting for the GPU that
     // will be later used in the DLB accounting.
@@ -1410,8 +1402,15 @@ void do_force(FILE                                     *fplog,
             {
                 nbv->launch_copy_f_to_gpu(forceOut.f(), Nbnxm::AtomLocality::NonLocal);
             }
+
+            // flag to specify if forces should be accumulated in force buffer
+            // ops. For non-local part, this just depends on whether CPU forces are present.
+            bool accumulateForce = (useGpuFBufOps == BufferOpsUseGpu::True) && haveCpuForces;
             nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::NonLocal,
-                                          forceOut.f(), useGpuFBufOps, accumulateForce);
+                                          forceOut.f(), pme_gpu_get_device_f(fr->pmedata),
+                                          pme_gpu_get_f_ready_synchronizer(fr->pmedata),
+                                          useGpuFBufOps, useGpuFPmeReduction, accumulateForce);
+
             if (useGpuFBufOps == BufferOpsUseGpu::True)
             {
                 nbv->launch_copy_f_from_gpu(forceOut.f(), Nbnxm::AtomLocality::NonLocal);
@@ -1456,7 +1455,7 @@ void do_force(FILE                                     *fplog,
 
     if (!alternateGpuWait && useGpuPme)
     {
-        pme_gpu_wait_and_reduce(fr->pmedata, pmeFlags, wcycle, &forceOut.forceWithVirial(), enerd);
+        pme_gpu_wait_and_reduce(fr->pmedata, pmeFlags, wcycle, &forceOut.forceWithVirial(), enerd, useGpuFPmeReduction);
     }
 
     /* Wait for local GPU NB outputs on the non-alternating wait path */
@@ -1515,12 +1514,22 @@ void do_force(FILE                                     *fplog,
         // - wait for force reduction does not need to block host (at least not here, it's sufficient to wait
         //   before the next CPU task that consumes the forces: vsite spread or update)
         //
-        if (useGpuFBufOps == BufferOpsUseGpu::True && haveCpuForces)
+        if (useGpuFBufOps == BufferOpsUseGpu::True && (haveCpuForces || DOMAINDECOMP(cr)))
         {
             nbv->launch_copy_f_to_gpu(forceOut.f(), Nbnxm::AtomLocality::Local);
         }
+        // flag to specify if forces should be accumulated in force
+        // buffer ops. For local part, this depends on whether CPU
+        // forces are present, or if DD is active (in which case the
+        // halo exchange has resulted in contributions from the
+        // non-local part).
+        bool accumulateForce = (useGpuFBufOps == BufferOpsUseGpu::True) &&
+            (haveCpuForces || DOMAINDECOMP(cr));
         nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local,
-                                      forceOut.f(), useGpuFBufOps, accumulateForce);
+                                      forceOut.f(), pme_gpu_get_device_f(fr->pmedata),
+                                      pme_gpu_get_f_ready_synchronizer(fr->pmedata),
+                                      useGpuFBufOps, useGpuFPmeReduction, accumulateForce);
+
         if (useGpuFBufOps == BufferOpsUseGpu::True)
         {
             nbv->launch_copy_f_from_gpu(forceOut.f(), Nbnxm::AtomLocality::Local);
index 8288a1d22773a50f49b320c5c2cf77bff5395ac6..dd0ccb2ce731548845d14fb1c0712461cef01a93 100644 (file)
@@ -1441,12 +1441,15 @@ static void nbnxn_atomdata_add_nbat_f_to_f_stdreduce(nbnxn_atomdata_t *nbat,
 
 /* Add the force array(s) from nbnxn_atomdata_t to f */
 template <bool  useGpu>
-void reduceForces(nbnxn_atomdata_t                   *nbat,
-                  const Nbnxm::AtomLocality           locality,
-                  const Nbnxm::GridSet               &gridSet,
-                  rvec                               *f,
-                  gmx_nbnxn_gpu_t                    *gpu_nbv,
-                  GpuBufferOpsAccumulateForce         accumulateForce)
+void reduceForces(nbnxn_atomdata_t                *nbat,
+                  const Nbnxm::AtomLocality        locality,
+                  const Nbnxm::GridSet            &gridSet,
+                  rvec                            *f,
+                  void                            *pmeFDeviceBuffer,
+                  GpuEventSynchronizer            *pmeForcesReady,
+                  gmx_nbnxn_gpu_t                 *gpu_nbv,
+                  bool                             useGpuFPmeReduction,
+                  bool                             accumulateForce)
 {
     int a0 = 0;
     int na = 0;
@@ -1463,9 +1466,11 @@ void reduceForces(nbnxn_atomdata_t                   *nbat,
     {
         Nbnxm::nbnxn_gpu_add_nbat_f_to_f(locality,
                                          gpu_nbv,
+                                         pmeFDeviceBuffer,
+                                         pmeForcesReady,
                                          a0, na,
+                                         useGpuFPmeReduction,
                                          accumulateForce);
-
     }
     else
     {
@@ -1510,16 +1515,22 @@ void reduceForces<true>(nbnxn_atomdata_t             *nbat,
                         const Nbnxm::AtomLocality     locality,
                         const Nbnxm::GridSet         &gridSet,
                         rvec                         *f,
+                        void                         *pmeFDeviceBuffer,
+                        GpuEventSynchronizer         *pmeForcesReady,
                         gmx_nbnxn_gpu_t              *gpu_nbv,
-                        GpuBufferOpsAccumulateForce   accumulateForce);
+                        bool                          useGpuFPmeReduction,
+                        bool                          accumulateForce);
 
 template
 void reduceForces<false>(nbnxn_atomdata_t             *nbat,
                          const Nbnxm::AtomLocality     locality,
                          const Nbnxm::GridSet         &gridSet,
                          rvec                         *f,
+                         void                         *pmeFDeviceBuffer,
+                         GpuEventSynchronizer         *pmeForcesReady,
                          gmx_nbnxn_gpu_t              *gpu_nbv,
-                         GpuBufferOpsAccumulateForce   accumulateForce);
+                         bool                          useGpuFPmeReduction,
+                         bool                          accumulateForce);
 
 void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t *nbat,
                                               rvec                   *fshift)
index d627732014cfaeb8bfba904722c1887d76c13e68..e34c8bd55898c677312e182d9fd157208da6bf39 100644 (file)
@@ -59,14 +59,14 @@ struct tMPI_Atomic;
 
 enum class BufferOpsUseGpu;
 
+class GpuEventSynchronizer;
+
 namespace Nbnxm
 {
 class GridSet;
 enum class KernelType;
 }
 
-enum class GpuBufferOpsAccumulateForce;
-
 /* Convenience type for vector with aligned memory */
 template<typename T>
 using AlignedVector = std::vector < T, gmx::AlignedAllocator < T>>;
@@ -344,8 +344,11 @@ void reduceForces(nbnxn_atomdata_t                   *nbat,
                   Nbnxm::AtomLocality                 locality,
                   const Nbnxm::GridSet               &gridSet,
                   rvec                               *f,
+                  void                               *pmeFDeviceBuffer,
+                  GpuEventSynchronizer               *pmeForcesReady,
                   gmx_nbnxn_gpu_t                    *gpu_nbv,
-                  GpuBufferOpsAccumulateForce         accumulateForce);
+                  bool                                useGpuFPmeReduction,
+                  bool                                accumulateForce);
 
 
 extern template
@@ -353,16 +356,22 @@ void reduceForces<true>(nbnxn_atomdata_t             *nbat,
                         const Nbnxm::AtomLocality     locality,
                         const Nbnxm::GridSet         &gridSet,
                         rvec                         *f,
+                        void                         *pmeFDeviceBuffer,
+                        GpuEventSynchronizer         *pmeForcesReady,
                         gmx_nbnxn_gpu_t              *gpu_nbv,
-                        GpuBufferOpsAccumulateForce   accumulateForce);
+                        bool                          useGpuFPmeReduction,
+                        bool                          accumulateForce);
 
 extern template
 void reduceForces<false>(nbnxn_atomdata_t             *nbat,
                          const Nbnxm::AtomLocality     locality,
                          const Nbnxm::GridSet         &gridSet,
                          rvec                         *f,
+                         void                         *pmeFDeviceBuffer,
+                         GpuEventSynchronizer         *pmeForcesReady,
                          gmx_nbnxn_gpu_t              *gpu_nbv,
-                         GpuBufferOpsAccumulateForce   accumulateForce);
+                         bool                          useGpuFPmeReduction,
+                         bool                          accumulateForce);
 
 /* Add the fshift force stored in nbat to fshift */
 void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t *nbat,
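
The extern template declarations above follow the standard idiom for
keeping a template's only instantiations in one translation unit; a
reduced sketch (one parameter stands in for the full argument list):

    // atomdata.h: declare, and promise the instantiations exist elsewhere
    template <bool useGpu> void reduceForces(rvec *f);
    extern template void reduceForces<true>(rvec *f);
    extern template void reduceForces<false>(rvec *f);

    // atomdata.cpp: define once and explicitly instantiate both variants,
    // so the GPU-dependent code stays out of the header; callers then
    // dispatch at run time, as nbnxm.cpp does:
    //   auto fn = useGpu == BufferOpsUseGpu::True ? reduceForces<true>
    //                                             : reduceForces<false>;
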
index 09d6e3724381a060f7883edbbac0bd62272e68f4..bc0d2c2c5e81e1dda9f0ec8bbb66b4bc4ea9296f 100644 (file)
@@ -139,24 +139,26 @@ __global__ void nbnxn_gpu_x_to_nbat_x_kernel(int                         numColu
 
 /*! \brief CUDA kernel to add part of the force array(s) from nbnxn_atomdata_t to f
  *
- * \param[in]     fnb     Force in nbat format
- * \param[in,out] f       Force buffer to be reduced into
- * \param[in]     cell    Cell index mapping
- * \param[in]     a0      start atom index
- * \param[in]     a1      end atom index
- * \param[in]     stride  stride between atoms in memory
+ * \param[in]     fnb              Force in nbat format
+ * \param[in]     fPmeDeviceBuffer PME forces (in atom-index order)
+ * \param[in,out] f                Force buffer to be reduced into
+ * \param[in]     cell             Cell index mapping
+ * \param[in]     atomStart        Start atom index
+ * \param[in]     nAtoms           Number of atoms
  */
-template <bool accumulateForce>
+template <bool accumulateForce, bool addPmeF>
 __global__ void
 nbnxn_gpu_add_nbat_f_to_f_kernel(const float3 *__restrict__ fnb,
-                                 rvec                     * f,
+                                 const float3 *__restrict__ fPmeDeviceBuffer,
+                                 float3                   * f,
                                  const int *__restrict__    cell,
                                  const int                  atomStart,
                                  const int                  nAtoms);
-template <bool accumulateForce>
+template <bool accumulateForce, bool addPmeF>
 __global__ void
 nbnxn_gpu_add_nbat_f_to_f_kernel(const float3 *__restrict__ fnb,
-                                 rvec                     * f,
+                                 const float3 *__restrict__ fPmeDeviceBuffer,
+                                 float3                   * f,
                                  const int *__restrict__    cell,
                                  const int                  atomStart,
                                  const int                  nAtoms)
@@ -170,16 +172,23 @@ nbnxn_gpu_add_nbat_f_to_f_kernel(const float3 *__restrict__ fnb,
     {
 
         int     i        = cell[atomStart+threadIndex];
-        float3 *f_dest   = (float3 *)&f[atomStart+threadIndex][XX];
+        float3 *fDest    = &f[atomStart+threadIndex];
+        float3  temp;
 
         if (accumulateForce)
         {
-            *f_dest += fnb[i];
+            temp  = *fDest;
+            temp += fnb[i];
         }
         else
         {
-            *f_dest = fnb[i];
+            temp = fnb[i];
         }
+        if (addPmeF)
+        {
+            temp += fPmeDeviceBuffer[atomStart+threadIndex];
+        }
+        *fDest = temp;
 
     }
     return;
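
Per thread, the four instantiations of this kernel reduce to the
following (a condensed restatement, with j = atomStart + threadIndex and
the float3 operators coming from vectype_ops.cuh):

    int    i    = cell[j];                        // atom -> nbat cell slot
    float3 temp = accumulateForce ? f[j] : make_float3(0.0f, 0.0f, 0.0f);
    temp += fnb[i];                               // short-range forces, via cell map
    if (addPmeF)
    {
        temp += fPmeDeviceBuffer[j];              // PME forces, already in atom order
    }
    f[j] = temp;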
index ec16b1b01ca41c1379e80cb86f973c8735c9ca9d..74925428ac79994726cdf9d441186172b152b1d4 100644 (file)
@@ -54,6 +54,7 @@
 #include "nbnxm_cuda.h"
 
 #include "gromacs/gpu_utils/cudautils.cuh"
+#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
 #include "gromacs/gpu_utils/vectype_ops.cuh"
 #include "gromacs/mdlib/force_flags.h"
 #include "gromacs/nbnxm/atomdata.h"
@@ -851,18 +852,27 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid               &grid,
 }
 
 /* F buffer operations on GPU: performs force summations and conversion from nb to rvec format. */
-void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality                  atomLocality,
-                               gmx_nbnxn_gpu_t                    *nb,
-                               int                                 atomStart,
-                               int                                 nAtoms,
-                               GpuBufferOpsAccumulateForce         accumulateForce)
+void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality               atomLocality,
+                               gmx_nbnxn_gpu_t                 *nb,
+                               void                            *fPmeDevicePtr,
+                               GpuEventSynchronizer            *pmeForcesReady,
+                               int                              atomStart,
+                               int                              nAtoms,
+                               bool                             useGpuFPmeReduction,
+                               bool                             accumulateForce)
 {
     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 
     const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality);
     cudaStream_t              stream    = nb->stream[iLocality];
+    cu_atomdata_t            *adat      = nb->atdat;
+    bool                      addPmeF   = useGpuFPmeReduction;
 
-    cu_atomdata_t            *adat    = nb->atdat;
+    if (addPmeF)
+    {
+        // Stream must wait for PME force completion
+        pmeForcesReady->enqueueWaitEvent(stream);
+    }
 
     /* launch kernel */
 
@@ -876,16 +886,27 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality                  atomLocality,
     config.sharedMemorySize = 0;
     config.stream           = stream;
 
-    auto              kernelFn = (accumulateForce == GpuBufferOpsAccumulateForce::True) ?
-        nbnxn_gpu_add_nbat_f_to_f_kernel<true> : nbnxn_gpu_add_nbat_f_to_f_kernel<false>;
-    const float3     *fPtr                    = adat->f;
-    rvec             *frvec                   = nb->frvec;
-    const int        *cell                    = nb->cell;
+    auto  kernelFn = accumulateForce ?
+        nbnxn_gpu_add_nbat_f_to_f_kernel<true, false> :
+        nbnxn_gpu_add_nbat_f_to_f_kernel<false, false>;
+
+    if (addPmeF)
+    {
+        kernelFn = accumulateForce ?
+            nbnxn_gpu_add_nbat_f_to_f_kernel<true, true> :
+            nbnxn_gpu_add_nbat_f_to_f_kernel<false, true>;
+    }
+
+    const float3     *d_fNB   = adat->f;
+    const float3     *d_fPme  = (float3*) fPmeDevicePtr;
+    float3           *d_f     = (float3*) nb->frvec;
+    const int        *d_cell  = nb->cell;
 
     const auto        kernelArgs   = prepareGpuKernelArguments(kernelFn, config,
-                                                               &fPtr,
-                                                               &frvec,
-                                                               &cell,
+                                                               &d_fNB,
+                                                               &d_fPme,
+                                                               &d_f,
+                                                               &d_cell,
                                                                &atomStart,
                                                                &nAtoms);
 
index 77119044cbcd08cc30362feb737b8c6e5ce4d0ea..dfe8c4e936b24168e23f2d1b48336515ab535ed9 100644 (file)
@@ -158,17 +158,43 @@ gmx::ArrayRef<const int> nonbonded_verlet_t::getGridIndices() const
     return pairSearch_->gridSet().cells();
 }
 
+void
+nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality           locality,
+                                             rvec                               *f)
+{
+
+    /* Skip the reduction if there was no short-range GPU work to do
+     * (either NB or both NB and bonded work). */
+    if (!pairlistIsSimple() && !haveGpuShortRangeWork(locality))
+    {
+        return;
+    }
+
+    wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
+    wallcycle_sub_start(wcycle_, ewcsNB_F_BUF_OPS);
+
+    reduceForces<false>(nbat.get(), locality, pairSearch_->gridSet(), f, nullptr, nullptr, gpu_nbv, false, false);
+
+    wallcycle_sub_stop(wcycle_, ewcsNB_F_BUF_OPS);
+    wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
+}
+
 void
 nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality           locality,
                                              rvec                               *f,
+                                             void                               *fPmeDeviceBuffer,
+                                             GpuEventSynchronizer               *pmeForcesReady,
                                              BufferOpsUseGpu                     useGpu,
-                                             GpuBufferOpsAccumulateForce         accumulateForce)
+                                             bool                                useGpuFPmeReduction,
+                                             bool                                accumulateForce)
 {
 
-    GMX_ASSERT(!((useGpu == BufferOpsUseGpu::False) &&
-                 (accumulateForce == GpuBufferOpsAccumulateForce::True)),
+    GMX_ASSERT(!((useGpu == BufferOpsUseGpu::False) && accumulateForce),
                "Accumulatation of force is only valid when GPU buffer ops are active");
 
+    GMX_ASSERT((useGpuFPmeReduction == (fPmeDeviceBuffer != nullptr)),
+               "GPU PME force reduction is only valid when a non-null GPU PME force pointer is available");
+
     /* Skip the reduction if there was no short-range GPU work to do
      * (either NB or both NB and bonded work). */
     if (!pairlistIsSimple() && !haveGpuShortRangeWork(locality))
@@ -180,7 +206,7 @@ nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality
     wallcycle_sub_start(wcycle_, ewcsNB_F_BUF_OPS);
 
     auto fn = useGpu == BufferOpsUseGpu::True ? reduceForces<true> : reduceForces<false>;
-    fn(nbat.get(), locality, pairSearch_->gridSet(), f, gpu_nbv, accumulateForce);
+    fn(nbat.get(), locality, pairSearch_->gridSet(), f, fPmeDeviceBuffer, pmeForcesReady, gpu_nbv, useGpuFPmeReduction, accumulateForce);
 
     wallcycle_sub_stop(wcycle_, ewcsNB_F_BUF_OPS);
     wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
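
The two overloads split the call sites cleanly; a sketch of their use
(matching the do_force() changes above):

    // CPU-only reduction: no PME buffer, no sync object, no accumulation
    // on the GPU.
    nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::All, forceOut.f());

    // GPU path: may wait on the PME stream, fold in PME forces, and
    // accumulate into a force buffer that already holds CPU contributions.
    nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local, forceOut.f(),
                                  pme_gpu_get_device_f(fr->pmedata),
                                  pme_gpu_get_f_ready_synchronizer(fr->pmedata),
                                  useGpuFBufOps, useGpuFPmeReduction, accumulateForce);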
index a749b074b9f468f8333519cf79799705b984f4a5..cf0b2f62ab9765714725e8134c84866533a1c711 100644 (file)
@@ -135,14 +135,7 @@ enum class BufferOpsUseGpu
     False
 };
 
-/*! \brief Switch for whether forces should accumulate in GPU buffer ops */
-enum class GpuBufferOpsAccumulateForce
-{
-    True,  // Force should be accumulated and format converted
-    False, // Force should be not accumulated, just format converted
-    Null   // GPU buffer ops are not in use, so this object is not applicable
-};
-
+class GpuEventSynchronizer;
 
 namespace gmx
 {
@@ -310,11 +303,29 @@ struct nonbonded_verlet_t
                                       int                         forceFlags,
                                       t_nrnb                     *nrnb);
 
-        //! Add the forces stored in nbat to f, zeros the forces in nbat */
+        /*! \brief Add the forces stored in nbat to f, and zero the forces in nbat
+         * \param[in]     locality  Local or non-local
+         * \param[in,out] f         Force to be added to
+         */
+        void atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality                 locality,
+                                      rvec                               *f);
+
+        /*! \brief Add the forces stored in nbat to f, allowing for the possibility that GPU buffer ops are active
+         * \param[in]     locality            Local or non-local
+         * \param[in,out] f                   Force to be added to
+         * \param[in]     fPme                Force from the PME calculation
+         * \param[in]     pmeForcesReady      Event triggered when the PME force calculation has completed
+         * \param[in]     useGpu              Whether GPU buffer ops are active
+         * \param[in]     useGpuFPmeReduction Whether PME force reduction is done on the GPU
+         * \param[in]     accumulateForce     Whether forces should be accumulated or stored
+         */
         void atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality                 locality,
                                       rvec                               *f,
+                                      void                               *fPme,
+                                      GpuEventSynchronizer               *pmeForcesReady,
                                       BufferOpsUseGpu                     useGpu,
-                                      GpuBufferOpsAccumulateForce         accumulateForce);
+                                      bool                                useGpuFPmeReduction,
+                                      bool                                accumulateForce);
 
         /*! \brief Outer body of function to perform initialization for F buffer operations on GPU. */
         void atomdata_init_add_nbat_f_to_f_gpu();
index b213ff4c07c117fbac44a05ea0c192b961f403b1..0f0e8de218035a8b1aa83e4f73b9b64a9363482f 100644 (file)
@@ -54,7 +54,6 @@
 
 struct nbnxn_atomdata_t;
 enum class GpuTaskCompletion;
-enum class GpuBufferOpsAccumulateForce;
 
 namespace gmx
 {
@@ -280,9 +279,12 @@ void nbnxn_gpu_init_add_nbat_f_to_f(const int               gmx_unused *cell,
 CUDA_FUNC_QUALIFIER
 void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality           gmx_unused  atomLocality,
                                gmx_nbnxn_gpu_t              gmx_unused *gpu_nbv,
+                               void                         gmx_unused *fPmeDevicePtr,
+                               GpuEventSynchronizer         gmx_unused *pmeForcesReady,
                                int                          gmx_unused  atomStart,
                                int                          gmx_unused  nAtoms,
-                               GpuBufferOpsAccumulateForce     gmx_unused  accumulateForce) CUDA_FUNC_TERM;
+                               bool                         gmx_unused  useGpuFPmeReduction,
+                               bool                         gmx_unused  accumulateForce) CUDA_FUNC_TERM;
 
 /*! \brief Copy force buffer from CPU to GPU */
 CUDA_FUNC_QUALIFIER