Pass the new gmx::ForceFlags to the nbnxm module
author    Szilárd Páll <pall.szilard@gmail.com>
          Fri, 30 Aug 2019 13:16:24 +0000 (15:16 +0200)
committer Mark Abraham <mark.j.abraham@gmail.com>
          Mon, 9 Sep 2019 21:04:41 +0000 (23:04 +0200)
- Changed the NB kernel dispatch, which now takes the new flags alongside the legacy integer flags
- Changed the NB GPU transfer launch and wait functions

Change-Id: Idd2738797ddcdd372e90cdfcc066b056a29d8de2
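
For context, a minimal sketch of the flags class being threaded through the nbnxm calls below. Only the two members the hunks reference are shown; the real definition lives in src/gromacs/mdlib/ppforceworkload.h (now included in place of force_flags.h), so treat this shape as an assumption rather than the full class.

namespace gmx
{

//! Assumed minimal shape of the per-step force schedule flags (sketch, not the full class)
class ForceFlags
{
    public:
        //! Whether energies are computed on this step
        bool computeEnergy = false;
        //! Whether the virial (and hence the shift forces) is computed on this step
        bool computeVirial = false;
};

} // namespace gmx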

src/gromacs/mdlib/sim_util.cpp
src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
src/gromacs/nbnxm/gpu_common.h
src/gromacs/nbnxm/kerneldispatch.cpp
src/gromacs/nbnxm/nbnxm.h
src/gromacs/nbnxm/nbnxm_gpu.h
src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp

diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index 171675e914a17f138bcba337c7a8ff673985514e..61ca2b0758881e2574edd22a6ae8231ecf5cb709 100644
@@ -314,14 +314,15 @@ static void post_process_forces(const t_commrec       *cr,
 static void do_nb_verlet(t_forcerec                       *fr,
                          const interaction_const_t        *ic,
                          gmx_enerdata_t                   *enerd,
-                         const int                         flags,
+                         int                               legacyForceFlags,
+                         const gmx::ForceFlags            &forceFlags,
                          const Nbnxm::InteractionLocality  ilocality,
                          const int                         clearF,
                          const int64_t                     step,
                          t_nrnb                           *nrnb,
                          gmx_wallcycle_t                   wcycle)
 {
-    if (!(flags & GMX_FORCE_NONBONDED))
+    if (!(legacyForceFlags & GMX_FORCE_NONBONDED))
     {
         /* skip non-bonded calculation */
         return;
@@ -351,7 +352,7 @@ static void do_nb_verlet(t_forcerec                       *fr,
         }
     }
 
-    nbv->dispatchNonbondedKernel(ilocality, *ic, flags, clearF, *fr, enerd, nrnb);
+    nbv->dispatchNonbondedKernel(ilocality, *ic, legacyForceFlags, forceFlags, clearF, *fr, enerd, nrnb);
 }
 
 static inline void clear_rvecs_omp(int n, rvec v[])
@@ -644,7 +645,7 @@ static void launchPmeGpuFftAndGather(gmx_pme_t        *pmedata,
  * \param[in,out] pmedata          PME module data
  * \param[in,out] forceOutputs     Output buffer for the forces and virial
  * \param[in,out] enerd            Energy data structure results are reduced into
- * \param[in]     flags            Force flags
+ * \param[in]     forceFlags       Force schedule flags
  * \param[in]     pmeFlags         PME flags
  * \param[in]     wcycle           The wallcycle structure
  */
@@ -652,7 +653,7 @@ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t                  *nbv
                                         gmx_pme_t                           *pmedata,
                                         gmx::ForceOutputs                   *forceOutputs,
                                         gmx_enerdata_t                      *enerd,
-                                        int                                  flags,
+                                        const gmx::ForceFlags               &forceFlags,
                                         int                                  pmeFlags,
                                         gmx_wallcycle_t                      wcycle)
 {
@@ -678,7 +679,7 @@ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t                  *nbv
         {
             GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
             isNbGpuDone = Nbnxm::gpu_try_finish_task(nbv->gpu_nbv,
-                                                     flags, // FIXME remove this
+                                                     forceFlags,
                                                      Nbnxm::AtomLocality::Local,
                                                      enerd->grpp.ener[egLJSR].data(),
                                                      enerd->grpp.ener[egCOULSR].data(),
@@ -1149,7 +1150,7 @@ void do_force(FILE                                     *fplog,
 
         /* launch local nonbonded work on GPU */
         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-        do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, enbvClearFNo,
+        do_nb_verlet(fr, ic, enerd, flags, forceFlags, Nbnxm::InteractionLocality::Local, enbvClearFNo,
                      step, nrnb, wcycle);
         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
@@ -1211,7 +1212,7 @@ void do_force(FILE                                     *fplog,
 
             /* launch non-local nonbonded tasks on GPU */
             wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-            do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
+            do_nb_verlet(fr, ic, enerd, flags, forceFlags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
                          step, nrnb, wcycle);
             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
 
@@ -1230,12 +1231,10 @@ void do_force(FILE                                     *fplog,
         if (havePPDomainDecomposition(cr))
         {
             Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(),
-                                      // FIXME
-                                      flags, Nbnxm::AtomLocality::NonLocal, copyBackNbForce);
+                                      forceFlags, Nbnxm::AtomLocality::NonLocal, copyBackNbForce);
         }
         Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(),
-                                  // FIXME
-                                  flags, Nbnxm::AtomLocality::Local, copyBackNbForce);
+                                  forceFlags, Nbnxm::AtomLocality::Local, copyBackNbForce);
         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
 
         if (forceWork.haveGpuBondedWork && forceFlags.computeEnergy)
@@ -1317,7 +1316,7 @@ void do_force(FILE                                     *fplog,
 
     if (!bUseOrEmulGPU)
     {
-        do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, enbvClearFYes,
+        do_nb_verlet(fr, ic, enerd, flags, forceFlags, Nbnxm::InteractionLocality::Local, enbvClearFYes,
                      step, nrnb, wcycle);
     }
 
@@ -1344,7 +1343,7 @@ void do_force(FILE                                     *fplog,
     {
         if (havePPDomainDecomposition(cr))
         {
-            do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
+            do_nb_verlet(fr, ic, enerd, flags, forceFlags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
                          step, nrnb, wcycle);
         }
 
@@ -1405,7 +1404,7 @@ void do_force(FILE                                     *fplog,
             if (bUseGPU)
             {
                 cycles_wait_gpu += Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
-                                                               flags, Nbnxm::AtomLocality::NonLocal,
+                                                               forceFlags, Nbnxm::AtomLocality::NonLocal,
                                                                enerd->grpp.ener[egLJSR].data(),
                                                                enerd->grpp.ener[egCOULSR].data(),
                                                                forceWithShiftForces.shiftForces(),
@@ -1414,7 +1413,7 @@ void do_force(FILE                                     *fplog,
             else
             {
                 wallcycle_start_nocount(wcycle, ewcFORCE);
-                do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFYes,
+                do_nb_verlet(fr, ic, enerd, flags, forceFlags, Nbnxm::InteractionLocality::NonLocal, enbvClearFYes,
                              step, nrnb, wcycle);
                 wallcycle_stop(wcycle, ewcFORCE);
             }
@@ -1470,7 +1469,7 @@ void do_force(FILE                                     *fplog,
     if (alternateGpuWait)
     {
         alternatePmeNbGpuWaitReduce(fr->nbv.get(), fr->pmedata, &forceOut, enerd,
-                                    flags, pmeFlags, wcycle);
+                                    forceFlags, pmeFlags, wcycle);
     }
 
     if (!alternateGpuWait && useGpuPme)
@@ -1489,7 +1488,7 @@ void do_force(FILE                                     *fplog,
         const float gpuWaitApiOverheadMargin = 2e6F; /* cycles */
         const float waitCycles               =
             Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
-                                        flags, Nbnxm::AtomLocality::Local,
+                                        forceFlags, Nbnxm::AtomLocality::Local,
                                         enerd->grpp.ener[egLJSR].data(),
                                         enerd->grpp.ener[egCOULSR].data(),
                                         forceOut.forceWithShiftForces().shiftForces(),
@@ -1517,7 +1516,7 @@ void do_force(FILE                                     *fplog,
         // NOTE: emulation kernel is not included in the balancing region,
         // but emulation mode does not target performance anyway
         wallcycle_start_nocount(wcycle, ewcFORCE);
-        do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local,
+        do_nb_verlet(fr, ic, enerd, flags, forceFlags, Nbnxm::InteractionLocality::Local,
                      DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
                      step, nrnb, wcycle);
         wallcycle_stop(wcycle, ewcFORCE);
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
index 74925428ac79994726cdf9d441186172b152b1d4..0b30c010b0042a9dacf7194e24f158492bdcdd89 100644
@@ -56,7 +56,7 @@
 #include "gromacs/gpu_utils/cudautils.cuh"
 #include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
 #include "gromacs/gpu_utils/vectype_ops.cuh"
-#include "gromacs/mdlib/force_flags.h"
+#include "gromacs/mdlib/ppforceworkload.h"
 #include "gromacs/nbnxm/atomdata.h"
 #include "gromacs/nbnxm/gpu_common.h"
 #include "gromacs/nbnxm/gpu_common_utils.h"
@@ -403,7 +403,7 @@ void gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t       *nb,
    with this event in the non-local stream before launching the non-bonded kernel.
  */
 void gpu_launch_kernel(gmx_nbnxn_cuda_t          *nb,
-                       const int                  flags,
+                       const gmx::ForceFlags     &forceFlags,
                        const InteractionLocality  iloc)
 {
     cu_atomdata_t       *adat    = nb->atdat;
@@ -412,8 +412,6 @@ void gpu_launch_kernel(gmx_nbnxn_cuda_t          *nb,
     cu_timers_t         *t       = nb->timers;
     cudaStream_t         stream  = nb->stream[iloc];
 
-    bool                 bCalcEner   = flags & GMX_FORCE_ENERGY;
-    bool                 bCalcFshift = flags & GMX_FORCE_VIRIAL;
     bool                 bDoTime     = nb->bDoTime;
 
     /* Don't launch the non-local kernel if there is no work to do.
@@ -488,10 +486,10 @@ void gpu_launch_kernel(gmx_nbnxn_cuda_t          *nb,
     auto       *timingEvent = bDoTime ? t->interaction[iloc].nb_k.fetchNextEvent() : nullptr;
     const auto  kernel      = select_nbnxn_kernel(nbp->eeltype,
                                                   nbp->vdwtype,
-                                                  bCalcEner,
+                                                  forceFlags.computeEnergy,
                                                   (plist->haveFreshList && !nb->timers->interaction[iloc].didPrune),
                                                   nb->dev_info);
-    const auto kernelArgs  = prepareGpuKernelArguments(kernel, config, adat, nbp, plist, &bCalcFshift);
+    const auto kernelArgs  = prepareGpuKernelArguments(kernel, config, adat, nbp, plist, &forceFlags.computeVirial);
     launchGpuKernel(kernel, config, timingEvent, "k_calc_nb", kernelArgs);
 
     if (bDoTime)
@@ -645,7 +643,7 @@ void gpu_launch_kernel_pruneonly(gmx_nbnxn_cuda_t          *nb,
 
 void gpu_launch_cpyback(gmx_nbnxn_cuda_t       *nb,
                         nbnxn_atomdata_t       *nbatom,
-                        const int               flags,
+                        const gmx::ForceFlags  &forceFlags,
                         const AtomLocality      atomLocality,
                         const bool              copyBackNbForce)
 {
@@ -663,9 +661,6 @@ void gpu_launch_cpyback(gmx_nbnxn_cuda_t       *nb,
     bool             bDoTime = nb->bDoTime;
     cudaStream_t     stream  = nb->stream[iloc];
 
-    bool             bCalcEner   = flags & GMX_FORCE_ENERGY;
-    bool             bCalcFshift = flags & GMX_FORCE_VIRIAL;
-
     /* don't launch non-local copy-back if there was no non-local work to do */
     if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
     {
@@ -708,15 +703,15 @@ void gpu_launch_cpyback(gmx_nbnxn_cuda_t       *nb,
     /* only transfer energies in the local stream */
     if (iloc == InteractionLocality::Local)
     {
-        /* DtoH fshift */
-        if (bCalcFshift)
+        /* DtoH fshift when virial is needed */
+        if (forceFlags.computeVirial)
         {
             cu_copy_D2H_async(nb->nbst.fshift, adat->fshift,
                               SHIFTS * sizeof(*nb->nbst.fshift), stream);
         }
 
         /* DtoH energies */
-        if (bCalcEner)
+        if (forceFlags.computeEnergy)
         {
             cu_copy_D2H_async(nb->nbst.e_lj, adat->e_lj,
                               sizeof(*nb->nbst.e_lj), stream);
diff --git a/src/gromacs/nbnxm/gpu_common.h b/src/gromacs/nbnxm/gpu_common.h
index 3d7871c96900d91702d832559d3153270ca1d151..599c97edd4d249c5d9980b9abc601a1baf2205ce 100644
@@ -58,7 +58,7 @@
 #include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/listed_forces/gpubonded.h"
 #include "gromacs/math/vec.h"
-#include "gromacs/mdlib/force_flags.h"
+#include "gromacs/mdlib/ppforceworkload.h"
 #include "gromacs/nbnxm/nbnxm.h"
 #include "gromacs/pbcutil/ishift.h"
 #include "gromacs/timing/gpu_timing.h"
@@ -367,7 +367,7 @@ gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t *timings,
 //TODO: move into shared source file with gmx_compile_cpp_as_cuda
 //NOLINTNEXTLINE(misc-definitions-in-headers)
 bool gpu_try_finish_task(gmx_nbnxn_gpu_t          *nb,
-                         const int                 flags,
+                         const gmx::ForceFlags    &forceFlags,
                          const AtomLocality        aloc,
                          real                     *e_lj,
                          real                     *e_el,
@@ -410,13 +410,10 @@ bool gpu_try_finish_task(gmx_nbnxn_gpu_t          *nb,
             gpuStreamSynchronize(nb->stream[iLocality]);
         }
 
-        bool calcEner   = (flags & GMX_FORCE_ENERGY) != 0;
-        bool calcFshift = (flags & GMX_FORCE_VIRIAL) != 0;
-
-        gpu_accumulate_timings(nb->timings, nb->timers, nb->plist[iLocality], aloc, calcEner,
+        gpu_accumulate_timings(nb->timings, nb->timers, nb->plist[iLocality], aloc, forceFlags.computeEnergy,
                                nb->bDoTime != 0);
 
-        gpu_reduce_staged_outputs(nb->nbst, iLocality, calcEner, calcFshift,
+        gpu_reduce_staged_outputs(nb->nbst, iLocality, forceFlags.computeEnergy, forceFlags.computeVirial,
                                   e_lj, e_el, as_rvec_array(shiftForces.data()));
     }
 
@@ -438,7 +435,7 @@ bool gpu_try_finish_task(gmx_nbnxn_gpu_t          *nb,
  * pruning flags.
  *
  * \param[in] nb The nonbonded data GPU structure
- * \param[in] flags Force flags
+ * \param[in]  forceFlags     Force schedule flags
  * \param[in] aloc Atom locality identifier
  * \param[out] e_lj Pointer to the LJ energy output to accumulate into
  * \param[out] e_el Pointer to the electrostatics energy output to accumulate into
@@ -448,7 +445,7 @@ bool gpu_try_finish_task(gmx_nbnxn_gpu_t          *nb,
  */
 //NOLINTNEXTLINE(misc-definitions-in-headers) TODO: move into source file
 float gpu_wait_finish_task(gmx_nbnxn_gpu_t         *nb,
-                           int                      flags,
+                           const gmx::ForceFlags   &forceFlags,
                            AtomLocality             aloc,
                            real                    *e_lj,
                            real                    *e_el,
@@ -459,7 +456,7 @@ float gpu_wait_finish_task(gmx_nbnxn_gpu_t         *nb,
         (gpuAtomToInteractionLocality(aloc) == InteractionLocality::Local) ? ewcWAIT_GPU_NB_L : ewcWAIT_GPU_NB_NL;
 
     wallcycle_start(wcycle, cycleCounter);
-    gpu_try_finish_task(nb, flags, aloc, e_lj, e_el, shiftForces,
+    gpu_try_finish_task(nb, forceFlags, aloc, e_lj, e_el, shiftForces,
                         GpuTaskCompletion::Wait, wcycle);
     float waitTime = wallcycle_stop(wcycle, cycleCounter);
 
diff --git a/src/gromacs/nbnxm/kerneldispatch.cpp b/src/gromacs/nbnxm/kerneldispatch.cpp
index c608a660c22a0eaf8f9a8b196fc7b43bcb10eea1..1303a20e483d67467a635795deb1578efab32274 100644
@@ -464,7 +464,8 @@ static void accountFlops(t_nrnb                           *nrnb,
 void
 nonbonded_verlet_t::dispatchNonbondedKernel(Nbnxm::InteractionLocality iLocality,
                                             const interaction_const_t &ic,
-                                            int                        forceFlags,
+                                            int                        legacyForceFlags,
+                                            const gmx::ForceFlags     &forceFlags,
                                             int                        clearF,
                                             const t_forcerec          &fr,
                                             gmx_enerdata_t            *enerd,
@@ -482,7 +483,7 @@ nonbonded_verlet_t::dispatchNonbondedKernel(Nbnxm::InteractionLocality iLocality
                              nbat.get(),
                              ic,
                              fr.shift_vec,
-                             forceFlags,
+                             legacyForceFlags,
                              clearF,
                              enerd->grpp.ener[egCOULSR].data(),
                              fr.bBHAM ?
@@ -499,7 +500,7 @@ nonbonded_verlet_t::dispatchNonbondedKernel(Nbnxm::InteractionLocality iLocality
             nbnxn_kernel_gpu_ref(pairlistSet.gpuList(),
                                  nbat.get(), &ic,
                                  fr.shift_vec,
-                                 forceFlags,
+                                 legacyForceFlags,
                                  clearF,
                                  nbat->out[0].f,
                                  nbat->out[0].fshift.data(),
@@ -514,7 +515,7 @@ nonbonded_verlet_t::dispatchNonbondedKernel(Nbnxm::InteractionLocality iLocality
 
     }
 
-    accountFlops(nrnb, pairlistSet, *this, ic, forceFlags);
+    accountFlops(nrnb, pairlistSet, *this, ic, legacyForceFlags);
 }
 
 void
diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h
index 86394f02ae242ffded0d731cb83928526176a5fc..5a6ff9dea4a2aac597d38c3055cab4788e612b31 100644
@@ -286,7 +286,8 @@ struct nonbonded_verlet_t
         //! \brief Executes the non-bonded kernel of the GPU or launches it on the GPU
         void dispatchNonbondedKernel(Nbnxm::InteractionLocality  iLocality,
                                      const interaction_const_t  &ic,
-                                     int                         forceFlags,
+                                     int                         legacyForceFlags,
+                                     const gmx::ForceFlags      &forceFlags,
                                      int                         clearF,
                                      const t_forcerec           &fr,
                                      gmx_enerdata_t             *enerd,
diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h
index 0fbd9e4690a9b68647fa66df0c6cf4b6e5a0727f..08f0aa9f96a36d16f4e80091e95ee86d79d29008 100644
@@ -59,6 +59,7 @@ enum class GpuTaskCompletion;
 namespace gmx
 {
 class GpuBonded;
+class ForceFlags;
 }
 
 namespace Nbnxm
@@ -92,9 +93,9 @@ void gpu_copy_xq_to_gpu(gmx_nbnxn_gpu_t gmx_unused               *nb,
  *
  */
 GPU_FUNC_QUALIFIER
-void gpu_launch_kernel(gmx_nbnxn_gpu_t gmx_unused     *nb,
-                       int gmx_unused                  flags,
-                       InteractionLocality gmx_unused  iloc) GPU_FUNC_TERM;
+void gpu_launch_kernel(gmx_nbnxn_gpu_t gmx_unused      *nb,
+                       const gmx::ForceFlags gmx_unused &forceFlags,
+                       InteractionLocality gmx_unused    iloc) GPU_FUNC_TERM;
 
 /*! \brief
  * Launch asynchronously the nonbonded prune-only kernel.
@@ -141,11 +142,11 @@ void gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t gmx_unused     *nb,
  * (and energies/shift forces if required).
  */
 GPU_FUNC_QUALIFIER
-void gpu_launch_cpyback(gmx_nbnxn_gpu_t  gmx_unused *nb,
-                        nbnxn_atomdata_t gmx_unused *nbatom,
-                        int              gmx_unused  flags,
-                        AtomLocality     gmx_unused  aloc,
-                        bool             gmx_unused  copyBackNbForce) GPU_FUNC_TERM;
+void gpu_launch_cpyback(gmx_nbnxn_gpu_t       gmx_unused *nb,
+                        nbnxn_atomdata_t      gmx_unused *nbatom,
+                        const gmx::ForceFlags gmx_unused  &forceFlags,
+                        AtomLocality          gmx_unused  aloc,
+                        bool                  gmx_unused  copyBackNbForce) GPU_FUNC_TERM;
 
 /*! \brief Attempts to complete nonbonded GPU task.
  *
@@ -174,25 +175,25 @@ void gpu_launch_cpyback(gmx_nbnxn_gpu_t  gmx_unused *nb,
  *  force buffer (instead of that being passed only to nbnxn_gpu_launch_cpyback()) and by returning
  *  the energy and Fshift contributions for some external/centralized reduction.
  *
- * \param[in]  nb     The nonbonded data GPU structure
- * \param[in]  flags  Force flags
- * \param[in]  aloc   Atom locality identifier
- * \param[out] e_lj   Pointer to the LJ energy output to accumulate into
- * \param[out] e_el   Pointer to the electrostatics energy output to accumulate into
+ * \param[in]  nb             The nonbonded data GPU structure
+ * \param[in]  forceFlags     Force schedule flags
+ * \param[in]  aloc           Atom locality identifier
+ * \param[out] e_lj           Pointer to the LJ energy output to accumulate into
+ * \param[out] e_el           Pointer to the electrostatics energy output to accumulate into
  * \param[out] shiftForces    Shift forces buffer to accumulate into
 * \param[in]  completionKind Indicates whether nonbonded task completion should only be checked rather than waited for
- * \param[out] wcycle Pointer to wallcycle data structure
- * \returns              True if the nonbonded tasks associated with \p aloc locality have completed
+ * \param[out] wcycle         Pointer to wallcycle data structure
+ * \returns                   True if the nonbonded tasks associated with \p aloc locality have completed
  */
 GPU_FUNC_QUALIFIER
-bool gpu_try_finish_task(gmx_nbnxn_gpu_t gmx_unused  *nb,
-                         int             gmx_unused   flags,
-                         AtomLocality    gmx_unused   aloc,
-                         real            gmx_unused  *e_lj,
-                         real            gmx_unused  *e_el,
+bool gpu_try_finish_task(gmx_nbnxn_gpu_t gmx_unused           *nb,
+                         const gmx::ForceFlags gmx_unused     &forceFlags,
+                         AtomLocality    gmx_unused           aloc,
+                         real            gmx_unused          *e_lj,
+                         real            gmx_unused          *e_el,
                          gmx::ArrayRef<gmx::RVec> gmx_unused  shiftForces,
-                         GpuTaskCompletion gmx_unused completionKind,
-                         gmx_wallcycle    gmx_unused  *wcycle) GPU_FUNC_TERM_WITH_RETURN(false);
+                         GpuTaskCompletion gmx_unused         completionKind,
+                         gmx_wallcycle    gmx_unused         *wcycle) GPU_FUNC_TERM_WITH_RETURN(false);
 
 /*! \brief  Completes the nonbonded GPU task, blocking until GPU tasks and data
  * transfers have finished.
@@ -202,7 +203,7 @@ bool gpu_try_finish_task(gmx_nbnxn_gpu_t gmx_unused  *nb,
  * pruning flags.
  *
  * \param[in] nb The nonbonded data GPU structure
- * \param[in] flags Force flags
+ * \param[in]  forceFlags     Force schedule flags
  * \param[in] aloc Atom locality identifier
  * \param[out] e_lj Pointer to the LJ energy output to accumulate into
  * \param[out] e_el Pointer to the electrostatics energy output to accumulate into
@@ -210,12 +211,12 @@ bool gpu_try_finish_task(gmx_nbnxn_gpu_t gmx_unused  *nb,
  */
 GPU_FUNC_QUALIFIER
 float gpu_wait_finish_task(gmx_nbnxn_gpu_t          gmx_unused *nb,
-                           int             gmx_unused  flags,
-                           AtomLocality    gmx_unused  aloc,
-                           real            gmx_unused *e_lj,
-                           real            gmx_unused *e_el,
-                           gmx::ArrayRef<gmx::RVec> gmx_unused shiftForces,
-                           gmx_wallcycle    gmx_unused  *wcycle) GPU_FUNC_TERM_WITH_RETURN(0.0);
+                           const gmx::ForceFlags    gmx_unused &forceFlags,
+                           AtomLocality             gmx_unused  aloc,
+                           real                     gmx_unused *e_lj,
+                           real                     gmx_unused *e_el,
+                           gmx::ArrayRef<gmx::RVec> gmx_unused  shiftForces,
+                           gmx_wallcycle            gmx_unused  *wcycle) GPU_FUNC_TERM_WITH_RETURN(0.0);
 
 /*! \brief Selects the Ewald kernel type, analytical or tabulated, single or twin cut-off. */
 GPU_FUNC_QUALIFIER
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
index 99a7829e29df983e782571f6fab14c6f9e339ba9..365b404ad60aeb135838d9b7f77a5bc80d9ac2a0 100644
@@ -72,7 +72,7 @@
 #include "gromacs/gpu_utils/gputraits_ocl.h"
 #include "gromacs/gpu_utils/oclutils.h"
 #include "gromacs/hardware/hw_info.h"
-#include "gromacs/mdlib/force_flags.h"
+#include "gromacs/mdlib/ppforceworkload.h"
 #include "gromacs/nbnxm/atomdata.h"
 #include "gromacs/nbnxm/gpu_common.h"
 #include "gromacs/nbnxm/gpu_common_utils.h"
@@ -468,7 +468,7 @@ void gpu_copy_xq_to_gpu(gmx_nbnxn_ocl_t        *nb,
    are finished and synchronize with this event in the non-local stream.
  */
 void gpu_launch_kernel(gmx_nbnxn_ocl_t                  *nb,
-                       const int                         flags,
+                       const gmx::ForceFlags            &forceFlags,
                        const Nbnxm::InteractionLocality  iloc)
 {
     cl_atomdata_t       *adat    = nb->atdat;
@@ -477,8 +477,6 @@ void gpu_launch_kernel(gmx_nbnxn_ocl_t                  *nb,
     cl_timers_t         *t       = nb->timers;
     cl_command_queue     stream  = nb->stream[iloc];
 
-    bool                 bCalcEner   = (flags & GMX_FORCE_ENERGY) != 0;
-    int                  bCalcFshift = flags & GMX_FORCE_VIRIAL;
     bool                 bDoTime     = (nb->bDoTime) != 0;
 
     cl_nbparam_params_t  nbparams_params;
@@ -548,17 +546,20 @@ void gpu_launch_kernel(gmx_nbnxn_ocl_t                  *nb,
     const auto     kernel       = select_nbnxn_kernel(nb,
                                                       nbp->eeltype,
                                                       nbp->vdwtype,
-                                                      bCalcEner,
+                                                      forceFlags.computeEnergy,
                                                       (plist->haveFreshList && !nb->timers->interaction[iloc].didPrune));
 
 
+    // The OpenCL kernel takes int as second to last argument because bool is
+    // not supported as a kernel argument type (sizeof(bool) is implementation defined).
+    const int computeFshift = forceFlags.computeVirial;
     if (useLjCombRule(nb->nbparam->vdwtype))
     {
         const auto kernelArgs = prepareGpuKernelArguments(kernel, config,
                                                           &nbparams_params, &adat->xq, &adat->f, &adat->e_lj, &adat->e_el, &adat->fshift,
                                                           &adat->lj_comb,
                                                           &adat->shift_vec, &nbp->nbfp_climg2d, &nbp->nbfp_comb_climg2d, &nbp->coulomb_tab_climg2d,
-                                                          &plist->sci, &plist->cj4, &plist->excl, &bCalcFshift);
+                                                          &plist->sci, &plist->cj4, &plist->excl, &computeFshift);
 
         launchGpuKernel(kernel, config, timingEvent, kernelName, kernelArgs);
     }
@@ -569,7 +570,7 @@ void gpu_launch_kernel(gmx_nbnxn_ocl_t                  *nb,
                                                           &nbparams_params, &adat->xq, &adat->f, &adat->e_lj, &adat->e_el, &adat->fshift,
                                                           &adat->atom_types,
                                                           &adat->shift_vec, &nbp->nbfp_climg2d, &nbp->nbfp_comb_climg2d, &nbp->coulomb_tab_climg2d,
-                                                          &plist->sci, &plist->cj4, &plist->excl, &bCalcFshift);
+                                                          &plist->sci, &plist->cj4, &plist->excl, &computeFshift);
         launchGpuKernel(kernel, config, timingEvent, kernelName, kernelArgs);
     }
 
@@ -733,9 +734,9 @@ void gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t           *nb,
  */
 void gpu_launch_cpyback(gmx_nbnxn_ocl_t                          *nb,
                         struct nbnxn_atomdata_t                  *nbatom,
-                        const int                                 flags,
+                        const gmx::ForceFlags                    &forceFlags,
                         const AtomLocality                        aloc,
-                        const bool                    gmx_unused  copyBackNbForce)
+                        const bool                     gmx_unused copyBackNbForce)
 {
     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 
@@ -750,10 +751,6 @@ void gpu_launch_cpyback(gmx_nbnxn_ocl_t                          *nb,
     bool                      bDoTime = nb->bDoTime == CL_TRUE;
     cl_command_queue          stream  = nb->stream[iloc];
 
-    bool                      bCalcEner   = (flags & GMX_FORCE_ENERGY) != 0;
-    int                       bCalcFshift = flags & GMX_FORCE_VIRIAL;
-
-
     /* don't launch non-local copy-back if there was no non-local work to do */
     if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
     {
@@ -806,15 +803,15 @@ void gpu_launch_cpyback(gmx_nbnxn_ocl_t                          *nb,
     /* only transfer energies in the local stream */
     if (iloc == InteractionLocality::Local)
     {
-        /* DtoH fshift */
-        if (bCalcFshift)
+        /* DtoH fshift when virial is needed */
+        if (forceFlags.computeVirial)
         {
             ocl_copy_D2H_async(nb->nbst.fshift, adat->fshift, 0,
                                SHIFTS * adat->fshift_elem_size, stream, bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
         }
 
         /* DtoH energies */
-        if (bCalcEner)
+        if (forceFlags.computeEnergy)
         {
             ocl_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, 0,
                                sizeof(float), stream, bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);