Merge release-5-0 into master

[alexxy/gromacs.git] / src / gromacs / mdlib / sim_util.c
diff --git a/src/gromacs/mdlib/sim_util.c b/src/gromacs/mdlib/sim_util.c

index 30de5e9b37bce73f8682210399bd5c5978d24572..b35dd827ac0afe009787c7e562d0baebeb264322 100644 (file)
--- a/src/gromacs/mdlib/sim_util.c
+++ b/src/gromacs/mdlib/sim_util.c
@@ -36,70 +36,69 @@
   */
  #include "gmxpre.h"
  
+#include "gromacs/legacyheaders/sim_util.h"
+
  #include "config.h"
  
  #include <assert.h>
  #include <math.h>
  #include <stdio.h>
  #include <string.h>
+
  #ifdef HAVE_SYS_TIME_H
  #include <sys/time.h>
  #endif
  
-#include "gromacs/legacyheaders/typedefs.h"
-#include "gromacs/utility/cstringutil.h"
-#include "gromacs/legacyheaders/names.h"
-#include "gromacs/legacyheaders/txtdump.h"
-#include "gromacs/pbcutil/pbc.h"
-#include "gromacs/legacyheaders/chargegroup.h"
-#include "gromacs/math/vec.h"
-#include "gromacs/legacyheaders/nrnb.h"
-#include "gromacs/legacyheaders/mdrun.h"
-#include "gromacs/legacyheaders/sim_util.h"
-#include "gromacs/legacyheaders/update.h"
-#include "gromacs/math/units.h"
-#include "gromacs/legacyheaders/mdatoms.h"
-#include "gromacs/legacyheaders/force.h"
-#include "gromacs/legacyheaders/pme.h"
-#include "gromacs/legacyheaders/disre.h"
-#include "gromacs/legacyheaders/orires.h"
-#include "gromacs/legacyheaders/network.h"
+#include "gromacs/essentialdynamics/edsam.h"
+#include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
+#include "gromacs/gmxlib/nonbonded/nb_kernel.h"
+#include "gromacs/imd/imd.h"
  #include "gromacs/legacyheaders/calcmu.h"
+#include "gromacs/legacyheaders/chargegroup.h"
  #include "gromacs/legacyheaders/constr.h"
  #include "gromacs/legacyheaders/copyrite.h"
+#include "gromacs/legacyheaders/disre.h"
  #include "gromacs/legacyheaders/domdec.h"
+#include "gromacs/legacyheaders/force.h"
  #include "gromacs/legacyheaders/genborn.h"
-#include "nbnxn_atomdata.h"
-#include "nbnxn_search.h"
-#include "nbnxn_kernels/nbnxn_kernel_ref.h"
-#include "nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.h"
-#include "nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.h"
-#include "nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
+#include "gromacs/legacyheaders/gmx_omp_nthreads.h"
+#include "gromacs/legacyheaders/mdatoms.h"
+#include "gromacs/legacyheaders/mdrun.h"
+#include "gromacs/legacyheaders/names.h"
+#include "gromacs/legacyheaders/network.h"
  #include "gromacs/legacyheaders/nonbonded.h"
-#include "../gmxlib/nonbonded/nb_kernel.h"
-#include "../gmxlib/nonbonded/nb_free_energy.h"
-
-#include "gromacs/bonded/bonded.h"
+#include "gromacs/legacyheaders/nrnb.h"
+#include "gromacs/legacyheaders/orires.h"
+#include "gromacs/legacyheaders/pme.h"
+#include "gromacs/legacyheaders/qmmm.h"
+#include "gromacs/legacyheaders/txtdump.h"
+#include "gromacs/legacyheaders/typedefs.h"
+#include "gromacs/legacyheaders/update.h"
  #include "gromacs/legacyheaders/types/commrec.h"
+#include "gromacs/listed-forces/bonded.h"
+#include "gromacs/math/units.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/mdlib/nb_verlet.h"
+#include "gromacs/mdlib/nbnxn_atomdata.h"
+#include "gromacs/mdlib/nbnxn_search.h"
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.h"
  #include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h"
+#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
+#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref.h"
+#include "gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.h"
+#include "gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.h"
  #include "gromacs/pbcutil/ishift.h"
  #include "gromacs/pbcutil/mshift.h"
+#include "gromacs/pbcutil/pbc.h"
+#include "gromacs/pulling/pull.h"
+#include "gromacs/pulling/pull_rotation.h"
  #include "gromacs/timing/wallcycle.h"
  #include "gromacs/timing/walltime_accounting.h"
+#include "gromacs/utility/cstringutil.h"
  #include "gromacs/utility/gmxmpi.h"
  #include "gromacs/utility/smalloc.h"
-#include "gromacs/essentialdynamics/edsam.h"
-#include "gromacs/pulling/pull.h"
-#include "gromacs/pulling/pull_rotation.h"
-#include "gromacs/imd/imd.h"
-#include "adress.h"
-#include "gromacs/legacyheaders/qmmm.h"
-
-#include "gromacs/legacyheaders/gmx_omp_nthreads.h"
-
-#include "nbnxn_cuda/nbnxn_cuda.h"
  
-#include "nb_verlet.h"
+#include "adress.h"
  
  void print_time(FILE                     *out,
                  gmx_walltime_accounting_t walltime_accounting,
@@ -1237,10 +1236,11 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
      }
  
      /* We calculate the non-bonded forces, when done on the CPU, here.
-     * We do this before calling do_force_lowlevel, as in there bondeds
-     * forces are calculated before PME, which does communication.
-     * With this order, non-bonded and bonded force calculation imbalance
-     * can be balanced out by the domain decomposition load balancing.
+     * We do this before calling do_force_lowlevel, because in that
+     * function, the listed forces are calculated before PME, which
+     * does communication.  With this order, non-bonded and listed
+     * force calculation imbalance can be balanced out by the domain
+     * decomposition load balancing.
       */
  
      if (!bUseOrEmulGPU)
@@ -1320,13 +1320,13 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
          update_QMMMrec(cr, fr, x, mdatoms, box, top);
      }
  
-    if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_POSRES].nr > 0)
+    if ((flags & GMX_FORCE_LISTED) && top->idef.il[F_POSRES].nr > 0)
      {
          posres_wrapper(flags, inputrec, nrnb, top, box, x,
                         enerd, lambda, fr);
      }
  
-    if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_FBPOSRES].nr > 0)
+    if ((flags & GMX_FORCE_LISTED) && top->idef.il[F_FBPOSRES].nr > 0)
      {
          fbposres_wrapper(inputrec, nrnb, top, box, x, enerd, fr);
      }
@@ -1399,6 +1399,16 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
  
      if (bDoForces && DOMAINDECOMP(cr))
      {
+        if (bUseGPU)
+        {
+            /* We are done with the CPU compute, but the GPU local non-bonded
+             * kernel can still be running while we communicate the forces.
+             * We start a counter here, so we can, hopefully, time the rest
+             * of the GPU kernel execution and data transfer.
+             */
+            wallcycle_start(wcycle, ewcWAIT_GPU_NB_L_EST);
+        }
+
          /* Communicate the forces */
          wallcycle_start(wcycle, ewcMOVEF);
          dd_move_f(cr->dd, f, fr->fshift);
@@ -1429,13 +1439,44 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
          /* wait for local forces (or calculate in emulation mode) */
          if (bUseGPU)
          {
+            float       cycles_tmp, cycles_wait_est;
+            const float cuda_api_overhead_margin = 50000.0f; /* cycles */
+
              wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
              nbnxn_cuda_wait_gpu(nbv->cu_nbv,
                                  nbv->grp[eintLocal].nbat,
                                  flags, eatLocal,
                                  enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
                                  fr->fshift);
-            cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+            cycles_tmp      = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+
+            if (bDoForces && DOMAINDECOMP(cr))
+            {
+                cycles_wait_est = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L_EST);
+
+                if (cycles_tmp < cuda_api_overhead_margin)
+                {
+                    /* We measured few cycles, it could be that the kernel
+                     * and transfer finished earlier and there was no actual
+                     * wait time, only API call overhead.
+                     * Then the actual time could be anywhere between 0 and
+                     * cycles_wait_est. As a compromise, we use half the time.
+                     */
+                    cycles_wait_est *= 0.5f;
+                }
+            }
+            else
+            {
+                /* No force communication so we actually timed the wait */
+                cycles_wait_est = cycles_tmp;
+            }
+            /* Even though this is after dd_move_f, the actual task we are
+             * waiting for runs asynchronously with dd_move_f and we usually
+             * have nothing to balance it with, so we can and should add
+             * the time to the force time for load balancing.
+             */
+            cycles_force    += cycles_wait_est;
+            cycles_wait_gpu += cycles_wait_est;
  
              /* now clear the GPU outputs while we finish the step on the CPU */
  
@@ -1869,13 +1910,13 @@ void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
          update_QMMMrec(cr, fr, x, mdatoms, box, top);
      }
  
-    if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_POSRES].nr > 0)
+    if ((flags & GMX_FORCE_LISTED) && top->idef.il[F_POSRES].nr > 0)
      {
          posres_wrapper(flags, inputrec, nrnb, top, box, x,
                         enerd, lambda, fr);
      }
  
-    if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_FBPOSRES].nr > 0)
+    if ((flags & GMX_FORCE_LISTED) && top->idef.il[F_FBPOSRES].nr > 0)
      {
          fbposres_wrapper(inputrec, nrnb, top, box, x, enerd, fr);
      }