Reorganize search-time code in do_force_cutsVERLET
author: Szilárd Páll <pall.szilard@gmail.com>
Fri, 8 Mar 2019 14:18:52 +0000 (15:18 +0100)
committer: Mark Abraham <mark.j.abraham@gmail.com>
Wed, 13 Mar 2019 21:25:47 +0000 (22:25 +0100)
Only code reordering is done here, in order to facilitate splitting out
the DD/search work from the force computation schedule.

Change-Id: I04b7fa97eed3a48b38389b9da4ac9be4b36c718a

src/gromacs/mdlib/sim_util.cpp

index cd6cb1ebc4bdaebaf6c96e35e7fc5b6afb5b8500..7b068f8bafc15a8563b8001590ecb14c73e9e2ed 100644 (file)
@@ -982,45 +982,36 @@ static void do_force_cutsVERLET(FILE *fplog,
         nbnxn_atomdata_set(nbv->nbat.get(), nbv->nbs.get(), mdatoms, fr->cginfo);
 
         wallcycle_stop(wcycle, ewcNS);
-    }
 
-    /* initialize the GPU atom data and copy shift vector */
-    if (bUseGPU)
-    {
-        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
-        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-
-        if (bNS)
+        /* initialize the GPU nbnxm atom data and bonded data structures */
+        if (bUseGPU)
         {
-            Nbnxm::gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat.get());
-        }
+            wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
 
-        Nbnxm::gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat.get());
+            wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+            Nbnxm::gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat.get());
+            wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
 
-        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+            if (fr->gpuBonded)
+            {
+                /* Now we put all atoms on the grid, we can assign bonded
+                 * interactions to the GPU, where the grid order is
+                 * needed. Also the xq, f and fshift device buffers have
+                 * been reallocated if needed, so the bonded code can
+                 * learn about them. */
+                // TODO the xq, f, and fshift buffers are now shared
+                // resources, so they should be maintained by a
+                // higher-level object than the nb module.
+                fr->gpuBonded->updateInteractionListsAndDeviceBuffers(nbnxn_get_gridindices(fr->nbv->nbs.get()),
+                                                                      top->idef,
+                                                                      Nbnxm::gpu_get_xq(nbv->gpu_nbv),
+                                                                      Nbnxm::gpu_get_f(nbv->gpu_nbv),
+                                                                      Nbnxm::gpu_get_fshift(nbv->gpu_nbv));
+            }
 
-        if (bNS && fr->gpuBonded)
-        {
-            /* Now we put all atoms on the grid, we can assign bonded
-             * interactions to the GPU, where the grid order is
-             * needed. Also the xq, f and fshift device buffers have
-             * been reallocated if needed, so the bonded code can
-             * learn about them. */
-            // TODO the xq, f, and fshift buffers are now shared
-            // resources, so they should be maintained by a
-            // higher-level object than the nb module.
-            fr->gpuBonded->updateInteractionListsAndDeviceBuffers(nbnxn_get_gridindices(fr->nbv->nbs.get()),
-                                                                  top->idef,
-                                                                  Nbnxm::gpu_get_xq(nbv->gpu_nbv),
-                                                                  Nbnxm::gpu_get_f(nbv->gpu_nbv),
-                                                                  Nbnxm::gpu_get_fshift(nbv->gpu_nbv));
+            wallcycle_stop(wcycle, ewcLAUNCH_GPU);
         }
 
-        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
-    }
-
-    if (bNS)
-    {
         // Need to run after the GPU-offload bonded interaction lists
         // are set up to be able to determine whether there is bonded work.
         setupForceWorkload(ppForceWorkload,
@@ -1035,6 +1026,7 @@ static void do_force_cutsVERLET(FILE *fplog,
     /* do local pair search */
     if (bNS)
     {
+        // TODO: fuse this branch with the above bNS block
         wallcycle_start_nocount(wcycle, ewcNS);
         wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
         /* Note that with a GPU the launch overhead of the list transfer is not timed separately */
@@ -1057,6 +1049,7 @@ static void do_force_cutsVERLET(FILE *fplog,
         wallcycle_start(wcycle, ewcLAUNCH_GPU);
 
         wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+        Nbnxm::gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat.get());
         Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(),
                                   Nbnxm::AtomLocality::Local,
                                   ppForceWorkload->haveGpuBondedWork);
@@ -1094,6 +1087,7 @@ static void do_force_cutsVERLET(FILE *fplog,
     {
         if (bNS)
         {
+            // TODO: fuse this branch with the above large bNS block
             wallcycle_start_nocount(wcycle, ewcNS);
             wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
             /* Note that with a GPU the launch overhead of the list transfer is not timed separately */