Use enum class for nbnxm locality

author Berk Hess <hess@kth.se>

Tue, 15 Jan 2019 08:35:04 +0000 (09:35 +0100)

committer Mark Abraham <mark.j.abraham@gmail.com>

Sun, 17 Feb 2019 18:25:33 +0000 (19:25 +0100)
author Berk Hess <hess@kth.se>
Tue, 15 Jan 2019 08:35:04 +0000 (09:35 +0100)
committer Mark Abraham <mark.j.abraham@gmail.com>
Sun, 17 Feb 2019 18:25:33 +0000 (19:25 +0100)
diff --git a/src/gromacs/domdec/partition.cpp b/src/gromacs/domdec/partition.cpp

index be726af14104d839aacbc961376ef41b849b2529..83ffe74855a82633455142c29b360b8769ce0134 100644 (file)
--- a/src/gromacs/domdec/partition.cpp
+++ b/src/gromacs/domdec/partition.cpp
@@ -3422,7 +3422,7 @@ void dd_partition_system(FILE                    *fplog,
                                    fr->cginfo,
                                    state_local->x,
                                    ncg_moved, bRedist ? comm->movedBuffer.data() : nullptr,
-                                  fr->nbv->grp[eintLocal].kernel_type,
+                                  fr->nbv->grp[Nbnxm::InteractionLocality::Local].kernel_type,
                                    fr->nbv->nbat);
  
                  nbnxn_get_ncells(fr->nbv->nbs.get(), &ncells_new[XX], &ncells_new[YY]);
diff --git a/src/gromacs/ewald/pme_load_balancing.cpp b/src/gromacs/ewald/pme_load_balancing.cpp

index 579fc3c7f9b4a67d2ad177e2c6ccf7cc5bf62377..778c7c2c4818e5d2ff0a7c268b9039a36382dfdd 100644 (file)
--- a/src/gromacs/ewald/pme_load_balancing.cpp
+++ b/src/gromacs/ewald/pme_load_balancing.cpp
@@ -825,7 +825,7 @@ pme_load_balance(pme_load_balancing_t      *pme_lb,
      /* We always re-initialize the tables whether they are used or not */
      init_interaction_const_tables(nullptr, ic, rtab);
  
-    nbnxn_gpu_pme_loadbal_update_param(nbv, ic, listParams);
+    Nbnxm::gpu_pme_loadbal_update_param(nbv, ic, listParams);
  
      if (!pme_lb->bSepPMERanks)
      {
diff --git a/src/gromacs/mdlib/forcerec.cpp b/src/gromacs/mdlib/forcerec.cpp

index 26a6a50a5cdde9e5fdea15f6830b03852f387132..1dda3a2c67708d42dabe9a49e43752d421fa43d5 100644 (file)
--- a/src/gromacs/mdlib/forcerec.cpp
+++ b/src/gromacs/mdlib/forcerec.cpp
@@ -2635,15 +2635,15 @@ void init_forcerec(FILE                             *fp,
              GMX_RELEASE_ASSERT(ir->rcoulomb == ir->rvdw, "With Verlet lists and no PME rcoulomb and rvdw should be identical");
          }
  
-        init_nb_verlet(mdlog, &fr->nbv, bFEP_NonBonded, ir, fr,
-                       cr, hardwareInfo, deviceInfo,
-                       mtop, box);
+        Nbnxm::init_nb_verlet(mdlog, &fr->nbv, bFEP_NonBonded, ir, fr,
+                              cr, hardwareInfo, deviceInfo,
+                              mtop, box);
  
          if (useGpuForBonded)
          {
              auto stream = DOMAINDECOMP(cr) ?
-                nbnxn_gpu_get_command_stream(fr->nbv->gpu_nbv, eintNonlocal) :
-                nbnxn_gpu_get_command_stream(fr->nbv->gpu_nbv, eintLocal);
+                Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, Nbnxm::InteractionLocality::NonLocal) :
+                Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, Nbnxm::InteractionLocality::Local);
              // TODO the heap allocation is only needed while
              // t_forcerec lacks a constructor.
              fr->gpuBonded = new gmx::GpuBonded(mtop->ffparams,
@@ -2685,7 +2685,7 @@ void free_gpu_resources(t_forcerec                          *fr,
      if (isPPrankUsingGPU)
      {
          /* free nbnxn data in GPU memory */
-        nbnxn_gpu_free(fr->nbv->gpu_nbv);
+        Nbnxm::gpu_free(fr->nbv->gpu_nbv);
          delete fr->gpuBonded;
          fr->gpuBonded = nullptr;
      }
diff --git a/src/gromacs/mdlib/resethandler.cpp b/src/gromacs/mdlib/resethandler.cpp

index 5269dfde4893674487d40fb8c853738ea2d27ffb..5697378346b2881b043ed0310c27aa6d724f12cc 100644 (file)
--- a/src/gromacs/mdlib/resethandler.cpp
+++ b/src/gromacs/mdlib/resethandler.cpp
@@ -175,7 +175,7 @@ bool ResetHandler::resetCountersImpl(
  
          if (use_GPU(nbv))
          {
-            nbnxn_gpu_reset_timings(nbv);
+            Nbnxm::gpu_reset_timings(nbv);
          }
  
          if (pme_gpu_task_enabled(pme))
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp

index 2fd4038c55134c4f43ac6d3a564c07cc70935137..6dedbb795b4a78c9bf5664e3dc175846d0ef0b43 100644 (file)
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -387,14 +387,15 @@ static void post_process_forces(const t_commrec           *cr,
      }
  }
  
-static void do_nb_verlet(t_forcerec *fr,
-                         const interaction_const_t *ic,
-                         gmx_enerdata_t *enerd,
-                         int flags, int ilocality,
-                         int clearF,
-                         int64_t step,
-                         t_nrnb *nrnb,
-                         gmx_wallcycle_t wcycle)
+static void do_nb_verlet(t_forcerec                       *fr,
+                         const interaction_const_t        *ic,
+                         gmx_enerdata_t                   *enerd,
+                         const int                         flags,
+                         const Nbnxm::InteractionLocality  ilocality,
+                         const int                         clearF,
+                         const int64_t                     step,
+                         t_nrnb                           *nrnb,
+                         gmx_wallcycle_t                   wcycle)
  {
      if (!(flags & GMX_FORCE_NONBONDED))
      {
@@ -859,11 +860,12 @@ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t                  *nbv
          {
              GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
              wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
-            isNbGpuDone = nbnxn_gpu_try_finish_task(nbv->gpu_nbv,
-                                                    flags, eatLocal,
-                                                    haveOtherWork,
-                                                    enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
-                                                    fshift, completionType);
+            isNbGpuDone = Nbnxm::gpu_try_finish_task(nbv->gpu_nbv,
+                                                     flags,
+                                                     Nbnxm::AtomLocality::Local,
+                                                     haveOtherWork,
+                                                     enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+                                                     fshift, completionType);
              wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
              // To get the call count right, when the task finished we
              // issue a start/stop.
@@ -874,7 +876,7 @@ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t                  *nbv
                  wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
                  wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
  
-                nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
+                nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::Local,
                                                 nbv->nbat, as_rvec_array(force->unpaddedArrayRef().data()), wcycle);
              }
          }
@@ -905,15 +907,15 @@ static inline void launchGpuRollingPruning(const t_commrec          *cr,
       */
      int  numRollingParts     = nbv->listParams->numRollingParts;
      GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
-    int  stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep;
+    int  stepWithCurrentList = step - nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists.outerListCreationStep;
      bool stepIsEven          = ((stepWithCurrentList & 1) == 0);
      if (stepWithCurrentList > 0 &&
          stepWithCurrentList < inputrec->nstlist - 1 &&
          (stepIsEven || DOMAINDECOMP(cr)))
      {
-        nbnxn_gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
-                                          stepIsEven ? eintLocal : eintNonlocal,
-                                          numRollingParts);
+        Nbnxm::gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
+                                           stepIsEven ? Nbnxm::InteractionLocality::Local : Nbnxm::InteractionLocality::NonLocal,
+                                           numRollingParts);
      }
  }
  
@@ -1084,7 +1086,7 @@ static void do_force_cutsVERLET(FILE *fplog,
                                nullptr, 0, mdatoms->homenr, -1,
                                fr->cginfo, x.unpaddedArrayRef(),
                                0, nullptr,
-                              nbv->grp[eintLocal].kernel_type,
+                              nbv->grp[Nbnxm::InteractionLocality::Local].kernel_type,
                                nbv->nbat);
              wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
          }
@@ -1093,7 +1095,7 @@ static void do_force_cutsVERLET(FILE *fplog,
              wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
              nbnxn_put_on_grid_nonlocal(nbv->nbs.get(), domdec_zones(cr->dd),
                                         fr->cginfo, x.unpaddedArrayRef(),
-                                       nbv->grp[eintNonlocal].kernel_type,
+                                       nbv->grp[Nbnxm::InteractionLocality::NonLocal].kernel_type,
                                         nbv->nbat);
              wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
          }
@@ -1111,10 +1113,10 @@ static void do_force_cutsVERLET(FILE *fplog,
  
          if (bNS)
          {
-            nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
+            Nbnxm::gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
          }
  
-        nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
+        Nbnxm::gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
  
          wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
  
@@ -1130,9 +1132,9 @@ static void do_force_cutsVERLET(FILE *fplog,
              // higher-level object than the nb module.
              fr->gpuBonded->updateInteractionListsAndDeviceBuffers(nbnxn_get_gridindices(fr->nbv->nbs.get()),
                                                                    top->idef,
-                                                                  nbnxn_gpu_get_xq(nbv->gpu_nbv),
-                                                                  nbnxn_gpu_get_f(nbv->gpu_nbv),
-                                                                  nbnxn_gpu_get_fshift(nbv->gpu_nbv));
+                                                                  Nbnxm::gpu_get_xq(nbv->gpu_nbv),
+                                                                  Nbnxm::gpu_get_f(nbv->gpu_nbv),
+                                                                  Nbnxm::gpu_get_fshift(nbv->gpu_nbv));
              ppForceWorkload->haveGpuBondedWork = fr->gpuBonded->haveInteractions();
          }
  
@@ -1142,35 +1144,38 @@ static void do_force_cutsVERLET(FILE *fplog,
      /* do local pair search */
      if (bNS)
      {
+        nbnxn_pairlist_set_t &pairlistSet = nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists;
+
          wallcycle_start_nocount(wcycle, ewcNS);
          wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
          nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
                              &top->excls,
                              nbv->listParams->rlistOuter,
                              nbv->min_ci_balanced,
-                            &nbv->grp[eintLocal].nbl_lists,
-                            eintLocal,
-                            nbv->grp[eintLocal].kernel_type,
+                            &pairlistSet,
+                            Nbnxm::InteractionLocality::Local,
+                            nbv->grp[Nbnxm::InteractionLocality::Local].kernel_type,
                              nrnb);
-        nbv->grp[eintLocal].nbl_lists.outerListCreationStep = step;
+        pairlistSet.outerListCreationStep = step;
          if (nbv->listParams->useDynamicPruning && !bUseGPU)
          {
-            nbnxnPrepareListForDynamicPruning(&nbv->grp[eintLocal].nbl_lists);
+            nbnxnPrepareListForDynamicPruning(&pairlistSet);
          }
          wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
  
          if (bUseGPU)
          {
              /* initialize local pair-list on the GPU */
-            nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
-                                    nbv->grp[eintLocal].nbl_lists.nblGpu[0],
-                                    eintLocal);
+            Nbnxm::gpu_init_pairlist(nbv->gpu_nbv,
+                                     pairlistSet.nblGpu[0],
+                                     Nbnxm::InteractionLocality::Local);
          }
          wallcycle_stop(wcycle, ewcNS);
      }
      else
      {
-        nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatLocal, FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
+        nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), Nbnxm::AtomLocality::Local,
+                                        FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
                                          nbv->nbat, wcycle);
      }
  
@@ -1184,7 +1189,7 @@ static void do_force_cutsVERLET(FILE *fplog,
          wallcycle_start(wcycle, ewcLAUNCH_GPU);
  
          wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-        nbnxn_gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, eatLocal, ppForceWorkload->haveGpuBondedWork);
+        Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork);
          wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
  
          // bonded work not split into separate local and non-local, so with DD
@@ -1198,7 +1203,7 @@ static void do_force_cutsVERLET(FILE *fplog,
  
          /* launch local nonbonded work on GPU */
          wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-        do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
+        do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, enbvClearFNo,
                       step, nrnb, wcycle);
          wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
          wallcycle_stop(wcycle, ewcLAUNCH_GPU);
@@ -1217,6 +1222,8 @@ static void do_force_cutsVERLET(FILE *fplog,
         do non-local pair search */
      if (DOMAINDECOMP(cr))
      {
+        nbnxn_pairlist_set_t &pairlistSet = nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists;
+
          if (bNS)
          {
              wallcycle_start_nocount(wcycle, ewcNS);
@@ -1226,23 +1233,23 @@ static void do_force_cutsVERLET(FILE *fplog,
                                  &top->excls,
                                  nbv->listParams->rlistOuter,
                                  nbv->min_ci_balanced,
-                                &nbv->grp[eintNonlocal].nbl_lists,
-                                eintNonlocal,
-                                nbv->grp[eintNonlocal].kernel_type,
+                                &pairlistSet,
+                                Nbnxm::InteractionLocality::NonLocal,
+                                nbv->grp[Nbnxm::InteractionLocality::NonLocal].kernel_type,
                                  nrnb);
-            nbv->grp[eintNonlocal].nbl_lists.outerListCreationStep = step;
+            pairlistSet.outerListCreationStep = step;
              if (nbv->listParams->useDynamicPruning && !bUseGPU)
              {
-                nbnxnPrepareListForDynamicPruning(&nbv->grp[eintNonlocal].nbl_lists);
+                nbnxnPrepareListForDynamicPruning(&pairlistSet);
              }
              wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
  
-            if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
+            if (nbv->grp[Nbnxm::InteractionLocality::NonLocal].kernel_type == nbnxnk8x8x8_GPU)
              {
                  /* initialize non-local pair-list on the GPU */
-                nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
-                                        nbv->grp[eintNonlocal].nbl_lists.nblGpu[0],
-                                        eintNonlocal);
+                Nbnxm::gpu_init_pairlist(nbv->gpu_nbv,
+                                         pairlistSet.nblGpu[0],
+                                         Nbnxm::InteractionLocality::NonLocal);
              }
              wallcycle_stop(wcycle, ewcNS);
          }
@@ -1250,7 +1257,8 @@ static void do_force_cutsVERLET(FILE *fplog,
          {
              dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
  
-            nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatNonlocal, FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
+            nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), Nbnxm::AtomLocality::NonLocal,
+                                            FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
                                              nbv->nbat, wcycle);
          }
  
@@ -1260,7 +1268,7 @@ static void do_force_cutsVERLET(FILE *fplog,
  
              /* launch non-local nonbonded tasks on GPU */
              wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-            nbnxn_gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, eatNonlocal, ppForceWorkload->haveGpuBondedWork);
+            Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, Nbnxm::AtomLocality::NonLocal, ppForceWorkload->haveGpuBondedWork);
              wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
  
              if (ppForceWorkload->haveGpuBondedWork)
@@ -1271,7 +1279,7 @@ static void do_force_cutsVERLET(FILE *fplog,
              }
  
              wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-            do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
+            do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
                           step, nrnb, wcycle);
              wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
  
@@ -1286,11 +1294,11 @@ static void do_force_cutsVERLET(FILE *fplog,
          wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
          if (DOMAINDECOMP(cr))
          {
-            nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
-                                     flags, eatNonlocal, ppForceWorkload->haveGpuBondedWork);
+            Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
+                                      flags, Nbnxm::AtomLocality::NonLocal, ppForceWorkload->haveGpuBondedWork);
          }
-        nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
-                                 flags, eatLocal, ppForceWorkload->haveGpuBondedWork);
+        Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
+                                  flags, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork);
          wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
  
          if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
@@ -1397,7 +1405,7 @@ static void do_force_cutsVERLET(FILE *fplog,
  
      if (!bUseOrEmulGPU)
      {
-        do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
+        do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, enbvClearFYes,
                       step, nrnb, wcycle);
      }
  
@@ -1406,18 +1414,18 @@ static void do_force_cutsVERLET(FILE *fplog,
          /* Calculate the local and non-local free energy interactions here.
           * Happens here on the CPU both with and without GPU.
           */
-        if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0)
+        if (fr->nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists.nbl_fep[0]->nrj > 0)
          {
-            do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists,
+            do_nb_verlet_fep(&fr->nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists,
                               fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms,
                               inputrec->fepvals, lambda,
                               enerd, flags, nrnb, wcycle);
          }
  
          if (DOMAINDECOMP(cr) &&
-            fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0)
+            fr->nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists.nbl_fep[0]->nrj > 0)
          {
-            do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists,
+            do_nb_verlet_fep(&fr->nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists,
                               fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms,
                               inputrec->fepvals, lambda,
                               enerd, flags, nrnb, wcycle);
@@ -1426,22 +1434,14 @@ static void do_force_cutsVERLET(FILE *fplog,
  
      if (!bUseOrEmulGPU)
      {
-        int aloc;
-
          if (DOMAINDECOMP(cr))
          {
-            do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
+            do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
                           step, nrnb, wcycle);
          }
  
-        if (!bUseOrEmulGPU)
-        {
-            aloc = eintLocal;
-        }
-        else
-        {
-            aloc = eintNonlocal;
-        }
+        const Nbnxm::InteractionLocality iloc =
+            (!bUseOrEmulGPU ? Nbnxm::InteractionLocality::Local : Nbnxm::InteractionLocality::NonLocal);
  
          /* Add all the non-bonded force to the normal force array.
           * This can be split into a local and a non-local part when overlapping
@@ -1449,13 +1449,13 @@ static void do_force_cutsVERLET(FILE *fplog,
           */
          wallcycle_stop(wcycle, ewcFORCE);
  
-        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatAll, nbv->nbat, f, wcycle);
+        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::All, nbv->nbat, f, wcycle);
  
          wallcycle_start_nocount(wcycle, ewcFORCE);
  
          /* if there are multiple fshift output buffers reduce them */
          if ((flags & GMX_FORCE_VIRIAL) &&
-            nbv->grp[aloc].nbl_lists.nnbl > 1)
+            nbv->grp[iloc].nbl_lists.nnbl > 1)
          {
              /* This is not in a subcounter because it takes a
                 negligible and constant-sized amount of time */
@@ -1493,25 +1493,25 @@ static void do_force_cutsVERLET(FILE *fplog,
              if (bUseGPU)
              {
                  wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
-                nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
-                                           flags, eatNonlocal,
-                                           ppForceWorkload->haveGpuBondedWork,
-                                           enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
-                                           fr->fshift);
+                Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
+                                            flags, Nbnxm::AtomLocality::NonLocal,
+                                            ppForceWorkload->haveGpuBondedWork,
+                                            enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+                                            fr->fshift);
                  cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
              }
              else
              {
                  wallcycle_start_nocount(wcycle, ewcFORCE);
-                do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
+                do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFYes,
                               step, nrnb, wcycle);
                  wallcycle_stop(wcycle, ewcFORCE);
              }
  
              /* skip the reduction if there was no non-local work to do */
-            if (!nbv->grp[eintNonlocal].nbl_lists.nblGpu[0]->sci.empty())
+            if (!nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists.nblGpu[0]->sci.empty())
              {
-                nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatNonlocal,
+                nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::NonLocal,
                                                 nbv->nbat, f, wcycle);
              }
          }
@@ -1558,10 +1558,10 @@ static void do_force_cutsVERLET(FILE *fplog,
          const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
  
          wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
-        nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
-                                   flags, eatLocal, ppForceWorkload->haveGpuBondedWork,
-                                   enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
-                                   fr->fshift);
+        Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
+                                    flags, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork,
+                                    enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+                                    fr->fshift);
          float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
  
          if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
@@ -1586,7 +1586,7 @@ static void do_force_cutsVERLET(FILE *fplog,
          // NOTE: emulation kernel is not included in the balancing region,
          // but emulation mode does not target performance anyway
          wallcycle_start_nocount(wcycle, ewcFORCE);
-        do_nb_verlet(fr, ic, enerd, flags, eintLocal,
+        do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local,
                       DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
                       step, nrnb, wcycle);
          wallcycle_stop(wcycle, ewcFORCE);
@@ -1602,7 +1602,7 @@ static void do_force_cutsVERLET(FILE *fplog,
          /* now clear the GPU outputs while we finish the step on the CPU */
          wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
          wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-        nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
+        Nbnxm::gpu_clear_outputs(nbv->gpu_nbv, flags);
  
          /* Is dynamic pair-list pruning activated? */
          if (nbv->listParams->useDynamicPruning)
@@ -1633,7 +1633,7 @@ static void do_force_cutsVERLET(FILE *fplog,
       * on the non-alternating path. */
      if (bUseOrEmulGPU && !alternateGpuWait)
      {
-        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
+        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::Local,
                                         nbv->nbat, f, wcycle);
      }
      if (DOMAINDECOMP(cr))
@@ -2744,7 +2744,7 @@ void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, const t_commrec *cr,
  
      if (printReport)
      {
-        auto                    nbnxn_gpu_timings = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
+        auto                    nbnxn_gpu_timings = use_GPU(nbv) ? Nbnxm::gpu_get_timings(nbv->gpu_nbv) : nullptr;
          gmx_wallclock_gpu_pme_t pme_gpu_timings   = {};
          if (pme_gpu_task_enabled(pme))
          {
diff --git a/src/gromacs/nbnxm/atomdata.cpp b/src/gromacs/nbnxm/atomdata.cpp

index db4ad05dc6267ca3bb41aa6e0303d45515efbfd9..62ba0c401436fa47863b22194b52e4e66c7df061 100644 (file)
--- a/src/gromacs/nbnxm/atomdata.cpp
+++ b/src/gromacs/nbnxm/atomdata.cpp
@@ -1004,12 +1004,12 @@ void nbnxn_atomdata_copy_shiftvec(gmx_bool          bDynamicBox,
  }
  
  /* Copies (and reorders) the coordinates to nbnxn_atomdata_t */
-void nbnxn_atomdata_copy_x_to_nbat_x(const nbnxn_search  *nbs,
-                                     int                  locality,
-                                     gmx_bool             FillLocal,
-                                     rvec                *x,
-                                     nbnxn_atomdata_t    *nbat,
-                                     gmx_wallcycle       *wcycle)
+void nbnxn_atomdata_copy_x_to_nbat_x(const nbnxn_search       *nbs,
+                                     const Nbnxm::AtomLocality locality,
+                                     gmx_bool                  FillLocal,
+                                     rvec                     *x,
+                                     nbnxn_atomdata_t         *nbat,
+                                     gmx_wallcycle            *wcycle)
  {
      wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
      wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
@@ -1019,15 +1019,16 @@ void nbnxn_atomdata_copy_x_to_nbat_x(const nbnxn_search  *nbs,
  
      switch (locality)
      {
-        case eatAll:
+        case Nbnxm::AtomLocality::All:
+        case Nbnxm::AtomLocality::Count:
              g0 = 0;
              g1 = nbs->grid.size();
              break;
-        case eatLocal:
+        case Nbnxm::AtomLocality::Local:
              g0 = 0;
              g1 = 1;
              break;
-        case eatNonlocal:
+        case Nbnxm::AtomLocality::NonLocal:
              g0 = 1;
              g1 = nbs->grid.size();
              break;
@@ -1467,11 +1468,11 @@ static void nbnxn_atomdata_add_nbat_f_to_f_stdreduce(nbnxn_atomdata_t *nbat,
  }
  
  /* Add the force array(s) from nbnxn_atomdata_t to f */
-void nbnxn_atomdata_add_nbat_f_to_f(nbnxn_search           *nbs,
-                                    int                     locality,
-                                    nbnxn_atomdata_t       *nbat,
-                                    rvec                   *f,
-                                    gmx_wallcycle          *wcycle)
+void nbnxn_atomdata_add_nbat_f_to_f(nbnxn_search             *nbs,
+                                    const Nbnxm::AtomLocality locality,
+                                    nbnxn_atomdata_t         *nbat,
+                                    rvec                     *f,
+                                    gmx_wallcycle            *wcycle)
  {
      wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
      wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
@@ -1482,15 +1483,16 @@ void nbnxn_atomdata_add_nbat_f_to_f(nbnxn_search           *nbs,
  
      switch (locality)
      {
-        case eatAll:
+        case Nbnxm::AtomLocality::All:
+        case Nbnxm::AtomLocality::Count:
              a0 = 0;
              na = nbs->natoms_nonlocal;
              break;
-        case eatLocal:
+        case Nbnxm::AtomLocality::Local:
              a0 = 0;
              na = nbs->natoms_local;
              break;
-        case eatNonlocal:
+        case Nbnxm::AtomLocality::NonLocal:
              a0 = nbs->natoms_local;
              na = nbs->natoms_nonlocal - nbs->natoms_local;
              break;
@@ -1500,7 +1502,7 @@ void nbnxn_atomdata_add_nbat_f_to_f(nbnxn_search           *nbs,
  
      if (nbat->out.size() > 1)
      {
-        if (locality != eatAll)
+        if (locality != Nbnxm::AtomLocality::All)
          {
              gmx_incons("add_f_to_f called with nout>1 and locality!=eatAll");
          }
diff --git a/src/gromacs/nbnxm/atomdata.h b/src/gromacs/nbnxm/atomdata.h

index ebfeee5bfbb35ba376505d401e48b61f2db74feb..bd210ea01444e107ed783a131748831213e86f08 100644 (file)
--- a/src/gromacs/nbnxm/atomdata.h
+++ b/src/gromacs/nbnxm/atomdata.h
@@ -42,6 +42,8 @@
  #include "gromacs/utility/basedefinitions.h"
  #include "gromacs/utility/real.h"
  
+#include "locality.h"
+
  namespace gmx
  {
  class MDLogger;
@@ -94,7 +96,7 @@ void nbnxn_atomdata_copy_shiftvec(gmx_bool          dynamic_box,
   * FillLocal tells if the local filler particle coordinates should be zeroed.
   */
  void nbnxn_atomdata_copy_x_to_nbat_x(const nbnxn_search  *nbs,
-                                     int                  locality,
+                                     Nbnxm::AtomLocality  locality,
                                       gmx_bool             FillLocal,
                                       rvec                *x,
                                       nbnxn_atomdata_t    *nbat,
@@ -102,7 +104,7 @@ void nbnxn_atomdata_copy_x_to_nbat_x(const nbnxn_search  *nbs,
  
  /* Add the forces stored in nbat to f, zeros the forces in nbat */
  void nbnxn_atomdata_add_nbat_f_to_f(nbnxn_search           *nbs,
-                                    int                     locality,
+                                    Nbnxm::AtomLocality     locality,
                                      nbnxn_atomdata_t       *nbat,
                                      rvec                   *f,
                                      gmx_wallcycle          *wcycle);
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu

index 15eb634f282bebc59839101b3b8ba46f5a259afb..7ce9b9fa5e2aac51f1fed4b2c85529f19ab199fc 100644 (file)
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
@@ -107,6 +107,9 @@
  #endif /* GMX_CUDA_NB_SINGLE_COMPILATION_UNIT */
  
  
+namespace Nbnxm
+{
+
  /*! Nonbonded kernel function pointer type */
  typedef void (*nbnxn_cu_kfunc_ptr_t)(const cu_atomdata_t,
                                       const cu_nbparam_t,
@@ -268,19 +271,24 @@ static inline int calc_shmem_required_nonbonded(const int num_threads_z, const g
  }
  
  /*! \brief Launch asynchronously the xq buffer host to device copy. */
-void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t       *nb,
-                              const nbnxn_atomdata_t *nbatom,
-                              int                     iloc,
-                              bool                    haveOtherWork)
+void gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t       *nb,
+                        const nbnxn_atomdata_t *nbatom,
+                        const AtomLocality      atomLocality,
+                        const bool              haveOtherWork)
  {
-    int                  adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
+    GMX_ASSERT(atomLocality == AtomLocality::Local || atomLocality == AtomLocality::NonLocal,
+               "Only local and non-local xq transfers are supported");
  
-    cu_atomdata_t       *adat    = nb->atdat;
-    cu_plist_t          *plist   = nb->plist[iloc];
-    cu_timers_t         *t       = nb->timers;
-    cudaStream_t         stream  = nb->stream[iloc];
+    const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
  
-    bool                 bDoTime     = nb->bDoTime;
+    int                       adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
+
+    cu_atomdata_t            *adat    = nb->atdat;
+    cu_plist_t               *plist   = nb->plist[iloc];
+    cu_timers_t              *t       = nb->timers;
+    cudaStream_t              stream  = nb->stream[iloc];
+
+    bool                      bDoTime     = nb->bDoTime;
  
      /* Don't launch the non-local H2D copy if there is no dependent
         work to do: neither non-local nor other (e.g. bonded) work
@@ -291,7 +299,7 @@ void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t       *nb,
         we always call the local local x+q copy (and the rest of the local
         work in nbnxn_gpu_launch_kernel().
       */
-    if (!haveOtherWork && canSkipWork(nb, iloc))
+    if (!haveOtherWork && canSkipWork(*nb, iloc))
      {
          plist->haveFreshList = false;
  
@@ -299,7 +307,7 @@ void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t       *nb,
      }
  
      /* calculate the atom data index range based on locality */
-    if (LOCAL_I(iloc))
+    if (atomLocality == AtomLocality::Local)
      {
          adat_begin  = 0;
          adat_len    = adat->natoms_local;
@@ -313,7 +321,7 @@ void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t       *nb,
      /* beginning of timed HtoD section */
      if (bDoTime)
      {
-        t->nb_h2d[iloc].openTimingRegion(stream);
+        t->xf[atomLocality].nb_h2d.openTimingRegion(stream);
      }
  
      /* HtoD x, q */
@@ -323,7 +331,7 @@ void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t       *nb,
  
      if (bDoTime)
      {
-        t->nb_h2d[iloc].closeTimingRegion(stream);
+        t->xf[atomLocality].nb_h2d.closeTimingRegion(stream);
      }
  
      /* When we get here all misc operations issued in the local stream as well as
@@ -334,7 +342,7 @@ void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t       *nb,
       */
      if (nb->bUseTwoStreams)
      {
-        if (iloc == eintLocal)
+        if (iloc == InteractionLocality::Local)
          {
              cudaError_t stat = cudaEventRecord(nb->misc_ops_and_local_H2D_done, stream);
              CU_RET_ERR(stat, "cudaEventRecord on misc_ops_and_local_H2D_done failed");
@@ -364,9 +372,9 @@ void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t       *nb,
     the local x+q H2D (and all preceding) tasks are complete and synchronize
     with this event in the non-local stream before launching the non-bonded kernel.
   */
-void nbnxn_gpu_launch_kernel(gmx_nbnxn_cuda_t       *nb,
-                             int                     flags,
-                             int                     iloc)
+void gpu_launch_kernel(gmx_nbnxn_cuda_t          *nb,
+                       const int                  flags,
+                       const InteractionLocality  iloc)
  {
      /* CUDA kernel launch-related stuff */
      int                  nblock;
@@ -392,7 +400,7 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_cuda_t       *nb,
         clearing. All these operations, except for the local interaction kernel,
         are needed for the non-local interactions. The skip of the local kernel
         call is taken care of later in this function. */
-    if (canSkipWork(nb, iloc))
+    if (canSkipWork(*nb, iloc))
      {
          plist->haveFreshList = false;
  
@@ -405,7 +413,7 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_cuda_t       *nb,
             (TODO: ATM that's the way the timing accounting can distinguish between
             separate prune kernel and combined force+prune, maybe we need a better way?).
           */
-        nbnxn_gpu_launch_kernel_pruneonly(nb, iloc, 1);
+        gpu_launch_kernel_pruneonly(nb, iloc, 1);
      }
  
      if (plist->nsci == 0)
@@ -417,14 +425,14 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_cuda_t       *nb,
      /* beginning of timed nonbonded calculation section */
      if (bDoTime)
      {
-        t->nb_k[iloc].openTimingRegion(stream);
+        t->interaction[iloc].nb_k.openTimingRegion(stream);
      }
  
      /* get the pointer to the kernel flavor we need to use */
      nb_kernel = select_nbnxn_kernel(nbp->eeltype,
                                      nbp->vdwtype,
                                      bCalcEner,
-                                    (plist->haveFreshList && !nb->timers->didPrune[iloc]),
+                                    (plist->haveFreshList && !nb->timers->interaction[iloc].didPrune),
                                      nb->dev_info);
  
      /* Kernel launch config:
@@ -458,13 +466,13 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_cuda_t       *nb,
                  config.sharedMemorySize);
      }
  
-    auto      *timingEvent = bDoTime ? t->nb_k[iloc].fetchNextEvent() : nullptr;
+    auto      *timingEvent = bDoTime ? t->interaction[iloc].nb_k.fetchNextEvent() : nullptr;
      const auto kernelArgs  = prepareGpuKernelArguments(nb_kernel, config, adat, nbp, plist, &bCalcFshift);
      launchGpuKernel(nb_kernel, config, timingEvent, "k_calc_nb", kernelArgs);
  
      if (bDoTime)
      {
-        t->nb_k[iloc].closeTimingRegion(stream);
+        t->interaction[iloc].nb_k.closeTimingRegion(stream);
      }
  
      if (GMX_NATIVE_WINDOWS)
@@ -487,9 +495,9 @@ static inline int calc_shmem_required_prune(const int num_threads_z)
      return shmem;
  }
  
-void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_cuda_t       *nb,
-                                       int                     iloc,
-                                       int                     numParts)
+void gpu_launch_kernel_pruneonly(gmx_nbnxn_cuda_t          *nb,
+                                 const InteractionLocality  iloc,
+                                 const int                  numParts)
  {
      cu_atomdata_t       *adat    = nb->atdat;
      cu_nbparam_t        *nbp     = nb->nbparam;
@@ -544,7 +552,7 @@ void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_cuda_t       *nb,
      GpuRegionTimer *timer = nullptr;
      if (bDoTime)
      {
-        timer = &(plist->haveFreshList ? t->prune_k[iloc] : t->rollingPrune_k[iloc]);
+        timer = &(plist->haveFreshList ? t->interaction[iloc].prune_k : t->interaction[iloc].rollingPrune_k);
      }
  
      /* beginning of timed prune calculation section */
@@ -589,14 +597,14 @@ void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_cuda_t       *nb,
         (combined or separate 1st pass prune, rolling prune). */
      if (plist->haveFreshList)
      {
-        plist->haveFreshList         = false;
+        plist->haveFreshList                   = false;
          /* Mark that pruning has been done */
-        nb->timers->didPrune[iloc] = true;
+        nb->timers->interaction[iloc].didPrune = true;
      }
      else
      {
          /* Mark that rolling pruning has been done */
-        nb->timers->didRollingPrune[iloc] = true;
+        nb->timers->interaction[iloc].didRollingPrune = true;
      }
  
      if (bDoTime)
@@ -611,18 +619,19 @@ void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_cuda_t       *nb,
      }
  }
  
-void nbnxn_gpu_launch_cpyback(gmx_nbnxn_cuda_t       *nb,
-                              nbnxn_atomdata_t       *nbatom,
-                              int                     flags,
-                              int                     aloc,
-                              bool                    haveOtherWork)
+void gpu_launch_cpyback(gmx_nbnxn_cuda_t       *nb,
+                        nbnxn_atomdata_t       *nbatom,
+                        const int               flags,
+                        const AtomLocality      atomLocality,
+                        const bool              haveOtherWork)
  {
      cudaError_t stat;
      int         adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
  
      /* determine interaction locality from atom locality */
-    int              iloc = gpuAtomToInteractionLocality(aloc);
+    const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
  
+    /* extract the data */
      cu_atomdata_t   *adat    = nb->atdat;
      cu_timers_t     *t       = nb->timers;
      bool             bDoTime = nb->bDoTime;
@@ -632,22 +641,22 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_cuda_t       *nb,
      bool             bCalcFshift = flags & GMX_FORCE_VIRIAL;
  
      /* don't launch non-local copy-back if there was no non-local work to do */
-    if (!haveOtherWork && canSkipWork(nb, iloc))
+    if (!haveOtherWork && canSkipWork(*nb, iloc))
      {
          return;
      }
  
-    getGpuAtomRange(adat, aloc, &adat_begin, &adat_len);
+    getGpuAtomRange(adat, atomLocality, &adat_begin, &adat_len);
  
      /* beginning of timed D2H section */
      if (bDoTime)
      {
-        t->nb_d2h[iloc].openTimingRegion(stream);
+        t->xf[atomLocality].nb_d2h.openTimingRegion(stream);
      }
  
      /* With DD the local D2H transfer can only start after the non-local
         kernel has finished. */
-    if (iloc == eintLocal && nb->bUseTwoStreams)
+    if (iloc == InteractionLocality::Local && nb->bUseTwoStreams)
      {
          stat = cudaStreamWaitEvent(stream, nb->nonlocal_done, 0);
          CU_RET_ERR(stat, "cudaStreamWaitEvent on nonlocal_done failed");
@@ -661,14 +670,14 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_cuda_t       *nb,
         recorded which signals that the local D2H can proceed. This event is not
         placed after the non-local kernel because we want the non-local data
         back first. */
-    if (iloc == eintNonlocal)
+    if (iloc == InteractionLocality::NonLocal)
      {
          stat = cudaEventRecord(nb->nonlocal_done, stream);
          CU_RET_ERR(stat, "cudaEventRecord on nonlocal_done failed");
      }
  
      /* only transfer energies in the local stream */
-    if (LOCAL_I(iloc))
+    if (iloc == InteractionLocality::Local)
      {
          /* DtoH fshift */
          if (bCalcFshift)
@@ -689,11 +698,11 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_cuda_t       *nb,
  
      if (bDoTime)
      {
-        t->nb_d2h[iloc].closeTimingRegion(stream);
+        t->xf[atomLocality].nb_d2h.closeTimingRegion(stream);
      }
  }
  
-void nbnxn_cuda_set_cacheconfig()
+void cuda_set_cacheconfig()
  {
      cudaError_t stat;
  
@@ -710,3 +719,5 @@ void nbnxn_cuda_set_cacheconfig()
          }
      }
  }
+
+} // namespace Nbnxm
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.h b/src/gromacs/nbnxm/cuda/nbnxm_cuda.h

index f732da87d7586114d880c02b769207efa1b12b59..42296f62b2f16ceb2ab196971da89d12a1fdc686 100644 (file)
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.h
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.h
@@ -36,10 +36,15 @@
   * \brief
   * Declares nbnxn cuda cache and texture helper functions
   */
-#ifndef GMX_MDLIB_NBNXN_CUDA_NBNXN_CUDA_H
-#define GMX_MDLIB_NBNXN_CUDA_NBNXN_CUDA_H
+#ifndef GMX_NBNXN_CUDA_NBNXN_CUDA_H
+#define GMX_NBNXN_CUDA_NBNXN_CUDA_H
+
+namespace Nbnxm
+{
  
  //! Set up the cache configuration for the non-bonded kernels.
-void nbnxn_cuda_set_cacheconfig();
+void cuda_set_cacheconfig();
+
+} // namespace Nbnxm
  
  #endif
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu

index 537098d479882ddd5213d7583975897560a5001f..f75b1c080f45c6a8335a300a4d5cdd0137ffdc33 100644 (file)
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
@@ -44,6 +44,11 @@
  #include <stdio.h>
  #include <stdlib.h>
  
+// TODO We would like to move this down, but the way gmx_nbnxn_gpu_t
+//      is currently declared means this has to be before gpu_types.h
+#include "nbnxm_cuda_types.h"
+
+// TODO Remove this comment when the above order issue is resolved
  #include "gromacs/gpu_utils/cudautils.cuh"
  #include "gromacs/gpu_utils/gpu_utils.h"
  #include "gromacs/gpu_utils/pmalloc_cuda.h"
@@ -63,7 +68,9 @@
  #include "gromacs/utility/smalloc.h"
  
  #include "nbnxm_cuda.h"
-#include "nbnxm_cuda_types.h"
+
+namespace Nbnxm
+{
  
  /* This is a heuristically determined parameter for the Kepler
   * and Maxwell architectures for the minimum size of ci lists by multiplying
@@ -330,11 +337,11 @@ static void init_nbparam(cu_nbparam_t                   *nbp,
  
  /*! Re-generate the GPU Ewald force table, resets rlist, and update the
   *  electrostatic type switching to twin cut-off (or back) if needed. */
-void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t    *nbv,
-                                        const interaction_const_t   *ic,
-                                        const NbnxnListParameters   *listParams)
+void gpu_pme_loadbal_update_param(const nonbonded_verlet_t    *nbv,
+                                  const interaction_const_t   *ic,
+                                  const NbnxnListParameters   *listParams)
  {
-    if (!nbv || nbv->grp[0].kernel_type != nbnxnk8x8x8_GPU)
+    if (!nbv || nbv->grp[InteractionLocality::Local].kernel_type != nbnxnk8x8x8_GPU)
      {
          return;
      }
@@ -371,18 +378,6 @@ static void init_plist(cu_plist_t *pl)
      pl->haveFreshList  = false;
  }
  
-/*! Initializes the timer data structure. */
-static void init_timers(cu_timers_t *t, bool bUseTwoStreams)
-{
-    /* The non-local counters/stream (second in the array) are needed only with DD. */
-    for (int i = 0; i <= (bUseTwoStreams ? 1 : 0); i++)
-    {
-        t->didPairlistH2D[i]  = false;
-        t->didPrune[i]        = false;
-        t->didRollingPrune[i] = false;
-    }
-}
-
  /*! Initializes the timings data structure. */
  static void init_timings(gmx_wallclock_gpu_nbnxn_t *t)
  {
@@ -408,10 +403,10 @@ static void init_timings(gmx_wallclock_gpu_nbnxn_t *t)
  }
  
  /*! Initializes simulation constant data. */
-static void nbnxn_cuda_init_const(gmx_nbnxn_cuda_t               *nb,
-                                  const interaction_const_t      *ic,
-                                  const NbnxnListParameters      *listParams,
-                                  const nbnxn_atomdata_t::Params &nbatParams)
+static void cuda_init_const(gmx_nbnxn_cuda_t               *nb,
+                            const interaction_const_t      *ic,
+                            const NbnxnListParameters      *listParams,
+                            const nbnxn_atomdata_t::Params &nbatParams)
  {
      init_atomdata_first(nb->atdat, nbatParams.numTypes);
      init_nbparam(nb->nbparam, ic, listParams, nbatParams);
@@ -420,13 +415,13 @@ static void nbnxn_cuda_init_const(gmx_nbnxn_cuda_t               *nb,
      nbnxn_cuda_clear_e_fshift(nb);
  }
  
-void nbnxn_gpu_init(gmx_nbnxn_cuda_t         **p_nb,
-                    const gmx_device_info_t   *deviceInfo,
-                    const interaction_const_t *ic,
-                    const NbnxnListParameters *listParams,
-                    const nbnxn_atomdata_t    *nbat,
-                    int                        /*rank*/,
-                    gmx_bool                   bLocalAndNonlocal)
+void gpu_init(gmx_nbnxn_cuda_t         **p_nb,
+              const gmx_device_info_t   *deviceInfo,
+              const interaction_const_t *ic,
+              const NbnxnListParameters *listParams,
+              const nbnxn_atomdata_t    *nbat,
+              int                        /*rank*/,
+              gmx_bool                   bLocalAndNonlocal)
  {
      cudaError_t       stat;
      gmx_nbnxn_cuda_t *nb;
@@ -439,10 +434,10 @@ void nbnxn_gpu_init(gmx_nbnxn_cuda_t         **p_nb,
      snew(nb, 1);
      snew(nb->atdat, 1);
      snew(nb->nbparam, 1);
-    snew(nb->plist[eintLocal], 1);
+    snew(nb->plist[InteractionLocality::Local], 1);
      if (bLocalAndNonlocal)
      {
-        snew(nb->plist[eintNonlocal], 1);
+        snew(nb->plist[InteractionLocality::NonLocal], 1);
      }
  
      nb->bUseTwoStreams = bLocalAndNonlocal;
@@ -455,17 +450,17 @@ void nbnxn_gpu_init(gmx_nbnxn_cuda_t         **p_nb,
      pmalloc((void**)&nb->nbst.e_el, sizeof(*nb->nbst.e_el));
      pmalloc((void**)&nb->nbst.fshift, SHIFTS * sizeof(*nb->nbst.fshift));
  
-    init_plist(nb->plist[eintLocal]);
+    init_plist(nb->plist[InteractionLocality::Local]);
  
      /* set device info, just point it to the right GPU among the detected ones */
      nb->dev_info = deviceInfo;
  
      /* local/non-local GPU streams */
-    stat = cudaStreamCreate(&nb->stream[eintLocal]);
-    CU_RET_ERR(stat, "cudaStreamCreate on stream[eintLocal] failed");
+    stat = cudaStreamCreate(&nb->stream[InteractionLocality::Local]);
+    CU_RET_ERR(stat, "cudaStreamCreate on stream[InterationLocality::Local] failed");
      if (nb->bUseTwoStreams)
      {
-        init_plist(nb->plist[eintNonlocal]);
+        init_plist(nb->plist[InteractionLocality::NonLocal]);
  
          /* Note that the device we're running on does not have to support
           * priorities, because we are querying the priority range which in this
@@ -475,10 +470,10 @@ void nbnxn_gpu_init(gmx_nbnxn_cuda_t         **p_nb,
          stat = cudaDeviceGetStreamPriorityRange(nullptr, &highest_priority);
          CU_RET_ERR(stat, "cudaDeviceGetStreamPriorityRange failed");
  
-        stat = cudaStreamCreateWithPriority(&nb->stream[eintNonlocal],
+        stat = cudaStreamCreateWithPriority(&nb->stream[InteractionLocality::NonLocal],
                                              cudaStreamDefault,
                                              highest_priority);
-        CU_RET_ERR(stat, "cudaStreamCreateWithPriority on stream[eintNonlocal] failed");
+        CU_RET_ERR(stat, "cudaStreamCreateWithPriority on stream[InteractionLocality::NonLocal] failed");
      }
  
      /* init events for sychronization (timing disabled for performance reasons!) */
@@ -495,15 +490,14 @@ void nbnxn_gpu_init(gmx_nbnxn_cuda_t         **p_nb,
  
      if (nb->bDoTime)
      {
-        init_timers(nb->timers, nb->bUseTwoStreams);
          init_timings(nb->timings);
      }
  
      /* set the kernel type for the current GPU */
      /* pick L1 cache configuration */
-    nbnxn_cuda_set_cacheconfig();
+    cuda_set_cacheconfig();
  
-    nbnxn_cuda_init_const(nb, ic, listParams, nbat->params());
+    cuda_init_const(nb, ic, listParams, nbat->params());
  
      *p_nb = nb;
  
@@ -513,9 +507,9 @@ void nbnxn_gpu_init(gmx_nbnxn_cuda_t         **p_nb,
      }
  }
  
-void nbnxn_gpu_init_pairlist(gmx_nbnxn_cuda_t       *nb,
-                             const NbnxnPairlistGpu *h_plist,
-                             int                     iloc)
+void gpu_init_pairlist(gmx_nbnxn_cuda_t          *nb,
+                       const NbnxnPairlistGpu    *h_plist,
+                       const InteractionLocality  iloc)
  {
      char          sbuf[STRLEN];
      bool          bDoTime    =  (nb->bDoTime && !h_plist->sci.empty());
@@ -536,10 +530,12 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_cuda_t       *nb,
          }
      }
  
+    gpu_timers_t::Interaction &iTimers = nb->timers->interaction[iloc];
+
      if (bDoTime)
      {
-        nb->timers->pl_h2d[iloc].openTimingRegion(stream);
-        nb->timers->didPairlistH2D[iloc] = true;
+        iTimers.pl_h2d.openTimingRegion(stream);
+        iTimers.didPairlistH2D = true;
      }
  
      Context context = nullptr;
@@ -548,13 +544,13 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_cuda_t       *nb,
                             &d_plist->nsci, &d_plist->sci_nalloc, context);
      copyToDeviceBuffer(&d_plist->sci, h_plist->sci.data(), 0, h_plist->sci.size(),
                         stream, GpuApiCallBehavior::Async,
-                       bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr);
+                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
  
      reallocateDeviceBuffer(&d_plist->cj4, h_plist->cj4.size(),
                             &d_plist->ncj4, &d_plist->cj4_nalloc, context);
      copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4.data(), 0, h_plist->cj4.size(),
                         stream, GpuApiCallBehavior::Async,
-                       bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr);
+                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
  
      reallocateDeviceBuffer(&d_plist->imask, h_plist->cj4.size()*c_nbnxnGpuClusterpairSplit,
                             &d_plist->nimask, &d_plist->imask_nalloc, context);
@@ -563,22 +559,22 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_cuda_t       *nb,
                             &d_plist->nexcl, &d_plist->excl_nalloc, context);
      copyToDeviceBuffer(&d_plist->excl, h_plist->excl.data(), 0, h_plist->excl.size(),
                         stream, GpuApiCallBehavior::Async,
-                       bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr);
+                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
  
      if (bDoTime)
      {
-        nb->timers->pl_h2d[iloc].closeTimingRegion(stream);
+        iTimers.pl_h2d.closeTimingRegion(stream);
      }
  
      /* the next use of thist list we be the first one, so we need to prune */
      d_plist->haveFreshList = true;
  }
  
-void nbnxn_gpu_upload_shiftvec(gmx_nbnxn_cuda_t       *nb,
-                               const nbnxn_atomdata_t *nbatom)
+void gpu_upload_shiftvec(gmx_nbnxn_cuda_t       *nb,
+                         const nbnxn_atomdata_t *nbatom)
  {
      cu_atomdata_t *adat  = nb->atdat;
-    cudaStream_t   ls    = nb->stream[eintLocal];
+    cudaStream_t   ls    = nb->stream[InteractionLocality::Local];
  
      /* only if we have a dynamic box */
      if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
@@ -594,7 +590,7 @@ static void nbnxn_cuda_clear_f(gmx_nbnxn_cuda_t *nb, int natoms_clear)
  {
      cudaError_t    stat;
      cu_atomdata_t *adat  = nb->atdat;
-    cudaStream_t   ls    = nb->stream[eintLocal];
+    cudaStream_t   ls    = nb->stream[InteractionLocality::Local];
  
      stat = cudaMemsetAsync(adat->f, 0, natoms_clear * sizeof(*adat->f), ls);
      CU_RET_ERR(stat, "cudaMemsetAsync on f falied");
@@ -605,7 +601,7 @@ static void nbnxn_cuda_clear_e_fshift(gmx_nbnxn_cuda_t *nb)
  {
      cudaError_t    stat;
      cu_atomdata_t *adat  = nb->atdat;
-    cudaStream_t   ls    = nb->stream[eintLocal];
+    cudaStream_t   ls    = nb->stream[InteractionLocality::Local];
  
      stat = cudaMemsetAsync(adat->fshift, 0, SHIFTS * sizeof(*adat->fshift), ls);
      CU_RET_ERR(stat, "cudaMemsetAsync on fshift falied");
@@ -615,7 +611,7 @@ static void nbnxn_cuda_clear_e_fshift(gmx_nbnxn_cuda_t *nb)
      CU_RET_ERR(stat, "cudaMemsetAsync on e_el falied");
  }
  
-void nbnxn_gpu_clear_outputs(gmx_nbnxn_cuda_t *nb, int flags)
+void gpu_clear_outputs(gmx_nbnxn_cuda_t *nb, int flags)
  {
      nbnxn_cuda_clear_f(nb, nb->atdat->natoms);
      /* clear shift force array and energies if the outputs were
@@ -626,8 +622,8 @@ void nbnxn_gpu_clear_outputs(gmx_nbnxn_cuda_t *nb, int flags)
      }
  }
  
-void nbnxn_gpu_init_atomdata(gmx_nbnxn_cuda_t              *nb,
-                             const nbnxn_atomdata_t        *nbat)
+void gpu_init_atomdata(gmx_nbnxn_cuda_t       *nb,
+                       const nbnxn_atomdata_t *nbat)
  {
      cudaError_t    stat;
      int            nalloc, natoms;
@@ -635,7 +631,7 @@ void nbnxn_gpu_init_atomdata(gmx_nbnxn_cuda_t              *nb,
      bool           bDoTime   = nb->bDoTime;
      cu_timers_t   *timers    = nb->timers;
      cu_atomdata_t *d_atdat   = nb->atdat;
-    cudaStream_t   ls        = nb->stream[eintLocal];
+    cudaStream_t   ls        = nb->stream[InteractionLocality::Local];
  
      natoms    = nbat->numAtoms();
      realloced = false;
@@ -714,7 +710,7 @@ static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t            *nbparam)
      }
  }
  
-void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb)
+void gpu_free(gmx_nbnxn_cuda_t *nb)
  {
      cudaError_t      stat;
      cu_atomdata_t   *atdat;
@@ -773,7 +769,7 @@ void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb)
      freeDeviceBuffer(&atdat->lj_comb);
  
      /* Free plist */
-    auto *plist = nb->plist[eintLocal];
+    auto *plist = nb->plist[InteractionLocality::Local];
      freeDeviceBuffer(&plist->sci);
      freeDeviceBuffer(&plist->cj4);
      freeDeviceBuffer(&plist->imask);
@@ -781,7 +777,7 @@ void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb)
      sfree(plist);
      if (nb->bUseTwoStreams)
      {
-        auto *plist_nl = nb->plist[eintNonlocal];
+        auto *plist_nl = nb->plist[InteractionLocality::NonLocal];
          freeDeviceBuffer(&plist_nl->sci);
          freeDeviceBuffer(&plist_nl->cj4);
          freeDeviceBuffer(&plist_nl->imask);
@@ -811,12 +807,12 @@ void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb)
  }
  
  //! This function is documented in the header file
-gmx_wallclock_gpu_nbnxn_t *nbnxn_gpu_get_timings(gmx_nbnxn_cuda_t *nb)
+gmx_wallclock_gpu_nbnxn_t *gpu_get_timings(gmx_nbnxn_cuda_t *nb)
  {
      return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr;
  }
  
-void nbnxn_gpu_reset_timings(nonbonded_verlet_t* nbv)
+void gpu_reset_timings(nonbonded_verlet_t* nbv)
  {
      if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
      {
@@ -824,44 +820,46 @@ void nbnxn_gpu_reset_timings(nonbonded_verlet_t* nbv)
      }
  }
  
-int nbnxn_gpu_min_ci_balanced(gmx_nbnxn_cuda_t *nb)
+int gpu_min_ci_balanced(gmx_nbnxn_cuda_t *nb)
  {
      return nb != nullptr ?
             gpu_min_ci_balanced_factor*nb->dev_info->prop.multiProcessorCount : 0;
  
  }
  
-gmx_bool nbnxn_gpu_is_kernel_ewald_analytical(const gmx_nbnxn_cuda_t *nb)
+gmx_bool gpu_is_kernel_ewald_analytical(const gmx_nbnxn_cuda_t *nb)
  {
      return ((nb->nbparam->eeltype == eelCuEWALD_ANA) ||
              (nb->nbparam->eeltype == eelCuEWALD_ANA_TWIN));
  }
  
-void *nbnxn_gpu_get_command_stream(gmx_nbnxn_gpu_t *nb,
-                                   int              iloc)
+void *gpu_get_command_stream(gmx_nbnxn_gpu_t           *nb,
+                             const InteractionLocality  iloc)
  {
      assert(nb);
  
      return static_cast<void *>(&nb->stream[iloc]);
  }
  
-void *nbnxn_gpu_get_xq(gmx_nbnxn_gpu_t *nb)
+void *gpu_get_xq(gmx_nbnxn_gpu_t *nb)
  {
      assert(nb);
  
      return static_cast<void *>(nb->atdat->xq);
  }
  
-void *nbnxn_gpu_get_f(gmx_nbnxn_gpu_t *nb)
+void *gpu_get_f(gmx_nbnxn_gpu_t *nb)
  {
      assert(nb);
  
      return static_cast<void *>(nb->atdat->f);
  }
  
-rvec *nbnxn_gpu_get_fshift(gmx_nbnxn_gpu_t *nb)
+rvec *gpu_get_fshift(gmx_nbnxn_gpu_t *nb)
  {
      assert(nb);
  
      return reinterpret_cast<rvec *>(nb->atdat->fshift);
  }
+
+} // namespace Nbnxm
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h

index 3d490c395a9cc3665be18380830918645b6ccda8..5a8380f5b5b7eb09c952e0941ca24f4ebd4a1c18 100644 (file)
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
@@ -52,8 +52,10 @@
  #include "gromacs/gpu_utils/gputraits.cuh"
  #include "gromacs/mdtypes/interaction_const.h"
  #include "gromacs/nbnxm/gpu_types_common.h"
+#include "gromacs/nbnxm/nbnxm.h"
  #include "gromacs/nbnxm/pairlist.h"
  #include "gromacs/timing/gpu_timing.h"
+#include "gromacs/utility/enumerationhelpers.h"
  
  /*! \brief Macro definining default for the prune kernel's j4 processing concurrency.
   *
@@ -197,26 +199,26 @@ struct cu_nbparam
  /** \internal
   * \brief Pair list data.
   */
-using cu_plist_t = gpu_plist;
+using cu_plist_t = Nbnxm::gpu_plist;
  
  /** \internal
   * \brief Typedef of actual timer type.
   */
-typedef struct nbnxn_gpu_timers_t cu_timers_t;
+typedef struct Nbnxm::gpu_timers_t cu_timers_t;
  
  /** \internal
   * \brief Main data structure for CUDA nonbonded force calculations.
   */
  struct gmx_nbnxn_cuda_t
  {
-    const gmx_device_info_t  *dev_info;       /**< CUDA device information                              */
-    bool                      bUseTwoStreams; /**< true if doing both local/non-local NB work on GPU    */
-    cu_atomdata_t            *atdat;          /**< atom data                                            */
-    cu_nbparam_t             *nbparam;        /**< parameters required for the non-bonded calc.         */
-    cu_plist_t               *plist[2];       /**< pair-list data structures (local and non-local)      */
-    nb_staging_t              nbst;           /**< staging area where fshift/energies get downloaded    */
-
-    cudaStream_t              stream[2];      /**< local and non-local GPU streams                      */
+    const gmx_device_info_t                                        *dev_info;       /**< CUDA device information                              */
+    bool                                                            bUseTwoStreams; /**< true if doing both local/non-local NB work on GPU    */
+    cu_atomdata_t                                                  *atdat;          /**< atom data                                            */
+    cu_nbparam_t                                                   *nbparam;        /**< parameters required for the non-bonded calc.         */
+    gmx::EnumerationArray<Nbnxm::InteractionLocality, cu_plist_t *> plist;          /**< pair-list data structures (local and non-local)      */
+    nb_staging_t                                                    nbst;           /**< staging area where fshift/energies get downloaded    */
+
+    gmx::EnumerationArray<Nbnxm::InteractionLocality, cudaStream_t> stream;         /**< local and non-local GPU streams                      */
  
      /** events used for synchronization */
      cudaEvent_t    nonlocal_done;               /**< event triggered when the non-local non-bonded kernel
diff --git a/src/gromacs/nbnxm/gpu_common.h b/src/gromacs/nbnxm/gpu_common.h

index cd64c8ca1a8b31761c3e89e61dc3e2afe68bbdc8..1624c56c8c5bb91aba3248330e2e78857ab82148 100644 (file)
--- a/src/gromacs/nbnxm/gpu_common.h
+++ b/src/gromacs/nbnxm/gpu_common.h
@@ -67,6 +67,9 @@
  #include "gpu_common_utils.h"
  #include "nbnxm_gpu.h"
  
+namespace Nbnxm
+{
+
  /*! \brief Check that atom locality values are valid for the GPU module.
   *
   *  In the GPU module atom locality "all" is not supported, the local and
@@ -74,12 +77,16 @@
   *
   *  \param[in] atomLocality atom locality specifier
   */
-static inline void validateGpuAtomLocality(int atomLocality)
+static inline void
+validateGpuAtomLocality(const AtomLocality atomLocality)
  {
      std::string str = gmx::formatString("Invalid atom locality passed (%d); valid here is only "
-                                        "local (%d) or nonlocal (%d)", atomLocality, eatLocal, eatNonlocal);
+                                        "local (%d) or nonlocal (%d)",
+                                        static_cast<int>(atomLocality),
+                                        static_cast<int>(AtomLocality::Local),
+                                        static_cast<int>(AtomLocality::NonLocal));
  
-    GMX_ASSERT(LOCAL_OR_NONLOCAL_A(atomLocality), str.c_str());
+    GMX_ASSERT(atomLocality == AtomLocality::Local || atomLocality == AtomLocality::NonLocal, str.c_str());
  }
  
  /*! \brief Convert atom locality to interaction locality.
@@ -90,18 +97,19 @@ static inline void validateGpuAtomLocality(int atomLocality)
   *  \param[in] atomLocality Atom locality specifier
   *  \returns                Interaction locality corresponding to the atom locality passed.
   */
-static inline int gpuAtomToInteractionLocality(int atomLocality)
+static inline InteractionLocality
+gpuAtomToInteractionLocality(const AtomLocality atomLocality)
  {
      validateGpuAtomLocality(atomLocality);
  
      /* determine interaction locality from atom locality */
-    if (LOCAL_A(atomLocality))
+    if (atomLocality == AtomLocality::Local)
      {
-        return eintLocal;
+        return InteractionLocality::Local;
      }
-    else if (NONLOCAL_A(atomLocality))
+    else if (atomLocality == AtomLocality::NonLocal)
      {
-        return eintNonlocal;
+        return InteractionLocality::NonLocal;
      }
      else
      {
@@ -117,16 +125,17 @@ static inline int gpuAtomToInteractionLocality(int atomLocality)
   * \param[out] atomRangeLen Atom range length in the atom data array.
   */
  template <typename AtomDataT>
-static inline void getGpuAtomRange(const AtomDataT *atomData,
-                                   int              atomLocality,
-                                   int             *atomRangeBegin,
-                                   int             *atomRangeLen)
+static inline void
+getGpuAtomRange(const AtomDataT    *atomData,
+                const AtomLocality  atomLocality,
+                int                *atomRangeBegin,
+                int                *atomRangeLen)
  {
      assert(atomData);
      validateGpuAtomLocality(atomLocality);
  
      /* calculate the atom data index range based on locality */
-    if (LOCAL_A(atomLocality))
+    if (atomLocality == AtomLocality::Local)
      {
          *atomRangeBegin  = 0;
          *atomRangeLen    = atomData->natoms_local;
@@ -155,24 +164,27 @@ static inline void getGpuAtomRange(const AtomDataT *atomData,
  template <typename GpuTimers>
  static void countPruneKernelTime(GpuTimers                 *timers,
                                   gmx_wallclock_gpu_nbnxn_t *timings,
-                                 const int                  iloc)
+                                 const InteractionLocality  iloc)
  {
+    gpu_timers_t::Interaction &iTimers = timers->interaction[iloc];
+
      // We might have not done any pruning (e.g. if we skipped with empty domains).
-    if (!timers->didPrune[iloc] && !timers->didRollingPrune[iloc])
+    if (!iTimers.didPrune &&
+        !iTimers.didRollingPrune)
      {
          return;
      }
  
-    if (timers->didPrune[iloc])
+    if (iTimers.didPrune)
      {
          timings->pruneTime.c++;
-        timings->pruneTime.t += timers->prune_k[iloc].getLastRangeTime();
+        timings->pruneTime.t += iTimers.prune_k.getLastRangeTime();
      }
  
-    if (timers->didRollingPrune[iloc])
+    if (iTimers.didRollingPrune)
      {
          timings->dynamicPruneTime.c++;
-        timings->dynamicPruneTime.t += timers->rollingPrune_k[iloc].getLastRangeTime();
+        timings->dynamicPruneTime.t += iTimers.rollingPrune_k.getLastRangeTime();
      }
  }
  
@@ -195,16 +207,17 @@ static void countPruneKernelTime(GpuTimers                 *timers,
   * \param[out] fshift         Pointer to the array of shift forces to accumulate into
   */
  template <typename StagingData>
-static inline void nbnxn_gpu_reduce_staged_outputs(const StagingData &nbst,
-                                                   int                iLocality,
-                                                   bool               reduceEnergies,
-                                                   bool               reduceFshift,
-                                                   real              *e_lj,
-                                                   real              *e_el,
-                                                   rvec              *fshift)
+static inline void
+gpu_reduce_staged_outputs(const StagingData         &nbst,
+                          const InteractionLocality  iLocality,
+                          const bool                 reduceEnergies,
+                          const bool                 reduceFshift,
+                          real                      *e_lj,
+                          real                      *e_el,
+                          rvec                      *fshift)
  {
      /* add up energies and shift forces (only once at local F wait) */
-    if (LOCAL_I(iLocality))
+    if (iLocality == InteractionLocality::Local)
      {
          if (reduceEnergies)
          {
@@ -244,12 +257,13 @@ static inline void nbnxn_gpu_reduce_staged_outputs(const StagingData &nbst,
   *
   */
  template <typename GpuTimers, typename GpuPairlist>
-static inline void nbnxn_gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t *timings,
-                                                GpuTimers                 *timers,
-                                                const GpuPairlist         *plist,
-                                                int                        atomLocality,
-                                                bool                       didEnergyKernels,
-                                                bool                       doTiming)
+static inline void
+gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t *timings,
+                       GpuTimers                 *timers,
+                       const GpuPairlist         *plist,
+                       AtomLocality               atomLocality,
+                       bool                       didEnergyKernels,
+                       bool                       doTiming)
  {
      /* timing data accumulation */
      if (!doTiming)
@@ -258,10 +272,10 @@ static inline void nbnxn_gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t *timin
      }
  
      /* determine interaction locality from atom locality */
-    int iLocality = gpuAtomToInteractionLocality(atomLocality);
+    const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality);
  
      /* only increase counter once (at local F wait) */
-    if (LOCAL_I(iLocality))
+    if (iLocality == InteractionLocality::Local)
      {
          timings->nb_c++;
          timings->ktime[plist->haveFreshList ? 1 : 0][didEnergyKernels ? 1 : 0].c += 1;
@@ -269,11 +283,11 @@ static inline void nbnxn_gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t *timin
  
      /* kernel timings */
      timings->ktime[plist->haveFreshList ? 1 : 0][didEnergyKernels ? 1 : 0].t +=
-        timers->nb_k[iLocality].getLastRangeTime();
+        timers->interaction[iLocality].nb_k.getLastRangeTime();
  
      /* X/q H2D and F D2H timings */
-    timings->nb_h2d_t += timers->nb_h2d[iLocality].getLastRangeTime();
-    timings->nb_d2h_t += timers->nb_d2h[iLocality].getLastRangeTime();
+    timings->nb_h2d_t += timers->xf[atomLocality].nb_h2d.getLastRangeTime();
+    timings->nb_d2h_t += timers->xf[atomLocality].nb_d2h.getLastRangeTime();
  
      /* Count the pruning kernel times for both cases:1st pass (at search step)
         and rolling pruning (if called at the previous step).
@@ -284,39 +298,41 @@ static inline void nbnxn_gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t *timin
      countPruneKernelTime(timers, timings, iLocality);
  
      /* only count atdat and pair-list H2D at pair-search step */
-    if (timers->didPairlistH2D[iLocality])
+    if (timers->interaction[iLocality].didPairlistH2D)
      {
          /* atdat transfer timing (add only once, at local F wait) */
-        if (LOCAL_A(atomLocality))
+        if (atomLocality == AtomLocality::Local)
          {
              timings->pl_h2d_c++;
              timings->pl_h2d_t += timers->atdat.getLastRangeTime();
          }
  
-        timings->pl_h2d_t += timers->pl_h2d[iLocality].getLastRangeTime();
+        timings->pl_h2d_t += timers->interaction[iLocality].pl_h2d.getLastRangeTime();
  
          /* Clear the timing flag for the next step */
-        timers->didPairlistH2D[iLocality] = false;
+        timers->interaction[iLocality].didPairlistH2D = false;
      }
  }
  
  //TODO: move into shared source file with gmx_compile_cpp_as_cuda
  //NOLINTNEXTLINE(misc-definitions-in-headers)
-bool nbnxn_gpu_try_finish_task(gmx_nbnxn_gpu_t  *nb,
-                               int               flags,
-                               int               aloc,
-                               bool              haveOtherWork,
-                               real             *e_lj,
-                               real             *e_el,
-                               rvec             *fshift,
-                               GpuTaskCompletion completionKind)
+bool gpu_try_finish_task(gmx_nbnxn_gpu_t    *nb,
+                         const int           flags,
+                         const AtomLocality  aloc,
+                         const bool          haveOtherWork,
+                         real               *e_lj,
+                         real               *e_el,
+                         rvec               *fshift,
+                         GpuTaskCompletion   completionKind)
  {
+    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+
      /* determine interaction locality from atom locality */
-    int iLocality = gpuAtomToInteractionLocality(aloc);
+    const InteractionLocality iLocality = gpuAtomToInteractionLocality(aloc);
  
      //  We skip when during the non-local phase there was actually no work to do.
      //  This is consistent with nbnxn_gpu_launch_kernel.
-    if (haveOtherWork || !canSkipWork(nb, iLocality))
+    if (haveOtherWork || !canSkipWork(*nb, iLocality))
      {
          // Query the state of the GPU stream and return early if we're not done
          if (completionKind == GpuTaskCompletion::Check)
@@ -336,14 +352,14 @@ bool nbnxn_gpu_try_finish_task(gmx_nbnxn_gpu_t  *nb,
          bool calcEner   = (flags & GMX_FORCE_ENERGY) != 0;
          bool calcFshift = (flags & GMX_FORCE_VIRIAL) != 0;
  
-        nbnxn_gpu_accumulate_timings(nb->timings, nb->timers, nb->plist[iLocality], aloc, calcEner,
-                                     nb->bDoTime != 0);
+        gpu_accumulate_timings(nb->timings, nb->timers, nb->plist[iLocality], aloc, calcEner,
+                               nb->bDoTime != 0);
  
-        nbnxn_gpu_reduce_staged_outputs(nb->nbst, iLocality, calcEner, calcFshift, e_lj, e_el, fshift);
+        gpu_reduce_staged_outputs(nb->nbst, iLocality, calcEner, calcFshift, e_lj, e_el, fshift);
      }
  
      /* Always reset both pruning flags (doesn't hurt doing it even when timing is off). */
-    nb->timers->didPrune[iLocality] = nb->timers->didRollingPrune[iLocality] = false;
+    nb->timers->interaction[iLocality].didPrune = nb->timers->interaction[iLocality].didRollingPrune = false;
  
      /* Turn off initial list pruning (doesn't hurt if this is not pair-search step). */
      nb->plist[iLocality]->haveFreshList = false;
@@ -368,16 +384,18 @@ bool nbnxn_gpu_try_finish_task(gmx_nbnxn_gpu_t  *nb,
   * \param[out] fshift Pointer to the shift force buffer to accumulate into
   */
  //NOLINTNEXTLINE(misc-definitions-in-headers) TODO: move into source file
-void nbnxn_gpu_wait_finish_task(gmx_nbnxn_gpu_t *nb,
-                                int              flags,
-                                int              aloc,
-                                bool             haveOtherWork,
-                                real            *e_lj,
-                                real            *e_el,
-                                rvec            *fshift)
+void gpu_wait_finish_task(gmx_nbnxn_gpu_t *nb,
+                          int              flags,
+                          AtomLocality     aloc,
+                          bool             haveOtherWork,
+                          real            *e_lj,
+                          real            *e_el,
+                          rvec            *fshift)
  {
-    nbnxn_gpu_try_finish_task(nb, flags, aloc, haveOtherWork, e_lj, e_el, fshift,
-                              GpuTaskCompletion::Wait);
+    gpu_try_finish_task(nb, flags, aloc, haveOtherWork, e_lj, e_el, fshift,
+                        GpuTaskCompletion::Wait);
  }
  
+} // namespace Nbnxm
+
  #endif
diff --git a/src/gromacs/nbnxm/gpu_common_utils.h b/src/gromacs/nbnxm/gpu_common_utils.h

index 40bffbb7b3b94bd0de8b157d9f9a38f995fff34b..02febb47ab082b0570f4944971aecc9dbf87d1df 100644 (file)
--- a/src/gromacs/nbnxm/gpu_common_utils.h
+++ b/src/gromacs/nbnxm/gpu_common_utils.h
@@ -54,6 +54,9 @@
  #include "opencl/nbnxm_ocl_types.h"
  #endif
  
+namespace Nbnxm
+{
+
  /*! \brief An early return condition for empty NB GPU workloads
   *
   * This is currently used for non-local kernels/transfers only.
@@ -61,10 +64,14 @@
   * local part of the force array also depends on the non-local kernel.
   * The skip of the local kernel is taken care of separately.
   */
-static inline bool canSkipWork(const gmx_nbnxn_gpu_t *nb, int iloc)
+static inline bool canSkipWork(const gmx_nbnxn_gpu_t &nb,
+                               InteractionLocality    iloc)
  {
-    assert(nb && nb->plist[iloc]);
-    return (iloc == eintNonlocal) && (nb->plist[iloc]->nsci == 0);
+    assert(nb.plist[iloc]);
+    return (iloc == InteractionLocality::NonLocal &&
+            nb.plist[iloc]->nsci == 0);
  }
  
+} // namespace Nbnxm
+
  #endif
diff --git a/src/gromacs/nbnxm/gpu_data_mgmt.h b/src/gromacs/nbnxm/gpu_data_mgmt.h

index 94d92a77e36a697285699e1bb3180c9ab1e67585..f578cf5cb99f12da5084357b8f6ca33c035440d0 100644 (file)
--- a/src/gromacs/nbnxm/gpu_data_mgmt.h
+++ b/src/gromacs/nbnxm/gpu_data_mgmt.h
@@ -48,6 +48,7 @@
  #include "gromacs/mdtypes/interaction_const.h"
  
  #include "gpu_types.h"
+#include "locality.h"
  
  struct nonbonded_verlet_group_t;
  struct NbnxnPairlistGpu;
@@ -57,90 +58,95 @@ struct gmx_wallclock_gpu_nbnxn_t;
  struct gmx_gpu_info_t;
  struct gmx_device_info_t;
  
+namespace Nbnxm
+{
+
  /** Initializes the data structures related to GPU nonbonded calculations. */
  GPU_FUNC_QUALIFIER
-void nbnxn_gpu_init(gmx_nbnxn_gpu_t gmx_unused            **p_nb,
-                    const gmx_device_info_t gmx_unused     *deviceInfo,
-                    const interaction_const_t gmx_unused   *ic,
-                    const NbnxnListParameters gmx_unused   *listParams,
-                    const nbnxn_atomdata_t gmx_unused      *nbat,
-                    int gmx_unused                          rank,
-                    /* true if both local and non-local are done on GPU */
-                    gmx_bool gmx_unused                     bLocalAndNonlocal) GPU_FUNC_TERM
+void gpu_init(gmx_nbnxn_gpu_t gmx_unused            **p_nb,
+              const gmx_device_info_t gmx_unused     *deviceInfo,
+              const interaction_const_t gmx_unused   *ic,
+              const NbnxnListParameters gmx_unused   *listParams,
+              const nbnxn_atomdata_t gmx_unused      *nbat,
+              int gmx_unused                          rank,
+              /* true if both local and non-local are done on GPU */
+              gmx_bool gmx_unused                     bLocalAndNonlocal) GPU_FUNC_TERM
  
  /** Initializes pair-list data for GPU, called at every pair search step. */
  GPU_FUNC_QUALIFIER
-void nbnxn_gpu_init_pairlist(gmx_nbnxn_gpu_t gmx_unused               *nb,
-                             const struct NbnxnPairlistGpu gmx_unused *h_nblist,
-                             int                    gmx_unused         iloc) GPU_FUNC_TERM
+void gpu_init_pairlist(gmx_nbnxn_gpu_t gmx_unused               *nb,
+                       const struct NbnxnPairlistGpu gmx_unused *h_nblist,
+                       InteractionLocality    gmx_unused         iloc) GPU_FUNC_TERM
  
  /** Initializes atom-data on the GPU, called at every pair search step. */
  GPU_FUNC_QUALIFIER
-void nbnxn_gpu_init_atomdata(gmx_nbnxn_gpu_t gmx_unused               *nb,
-                             const nbnxn_atomdata_t gmx_unused        *nbat) GPU_FUNC_TERM
+void gpu_init_atomdata(gmx_nbnxn_gpu_t gmx_unused               *nb,
+                       const nbnxn_atomdata_t gmx_unused        *nbat) GPU_FUNC_TERM
  
  /*! \brief Re-generate the GPU Ewald force table, resets rlist, and update the
   *  electrostatic type switching to twin cut-off (or back) if needed.
   */
  GPU_FUNC_QUALIFIER
-void nbnxn_gpu_pme_loadbal_update_param(const struct nonbonded_verlet_t gmx_unused *nbv,
-                                        const interaction_const_t gmx_unused       *ic,
-                                        const NbnxnListParameters gmx_unused       *listParams) GPU_FUNC_TERM
+void gpu_pme_loadbal_update_param(const struct nonbonded_verlet_t gmx_unused *nbv,
+                                  const interaction_const_t gmx_unused       *ic,
+                                  const NbnxnListParameters gmx_unused       *listParams) GPU_FUNC_TERM
  
  /** Uploads shift vector to the GPU if the box is dynamic (otherwise just returns). */
  GPU_FUNC_QUALIFIER
-void nbnxn_gpu_upload_shiftvec(gmx_nbnxn_gpu_t gmx_unused               *nb,
-                               const nbnxn_atomdata_t gmx_unused        *nbatom) GPU_FUNC_TERM
+void gpu_upload_shiftvec(gmx_nbnxn_gpu_t gmx_unused               *nb,
+                         const nbnxn_atomdata_t gmx_unused        *nbatom) GPU_FUNC_TERM
  
  /** Clears GPU outputs: nonbonded force, shift force and energy. */
  GPU_FUNC_QUALIFIER
-void nbnxn_gpu_clear_outputs(gmx_nbnxn_gpu_t gmx_unused *nb,
-                             int              gmx_unused flags) GPU_FUNC_TERM
+void gpu_clear_outputs(gmx_nbnxn_gpu_t gmx_unused *nb,
+                       int              gmx_unused flags) GPU_FUNC_TERM
  
  /** Frees all GPU resources used for the nonbonded calculations. */
  GPU_FUNC_QUALIFIER
-void nbnxn_gpu_free(gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM
+void gpu_free(gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM
  
  /** Returns the GPU timings structure or NULL if GPU is not used or timing is off. */
  GPU_FUNC_QUALIFIER
-struct gmx_wallclock_gpu_nbnxn_t *nbnxn_gpu_get_timings(gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM_WITH_RETURN(nullptr)
+struct gmx_wallclock_gpu_nbnxn_t *gpu_get_timings(gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM_WITH_RETURN(nullptr)
  
  /** Resets nonbonded GPU timings. */
  GPU_FUNC_QUALIFIER
-void nbnxn_gpu_reset_timings(struct nonbonded_verlet_t gmx_unused *nbv) GPU_FUNC_TERM
+void gpu_reset_timings(struct nonbonded_verlet_t gmx_unused *nbv) GPU_FUNC_TERM
  
  /** Calculates the minimum size of proximity lists to improve SM load balance
   *  with GPU non-bonded kernels. */
-GPU_FUNC_QUALIFIER
-int nbnxn_gpu_min_ci_balanced(gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM_WITH_RETURN(-1)
+     GPU_FUNC_QUALIFIER
+int gpu_min_ci_balanced(gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM_WITH_RETURN(-1)
  
  /** Returns if analytical Ewald GPU kernels are used. */
  GPU_FUNC_QUALIFIER
-gmx_bool nbnxn_gpu_is_kernel_ewald_analytical(const gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM_WITH_RETURN(FALSE)
+gmx_bool gpu_is_kernel_ewald_analytical(const gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM_WITH_RETURN(FALSE)
  
  /** Returns an opaque pointer to the GPU command stream
   *  Note: CUDA only.
   */
  CUDA_FUNC_QUALIFIER
-void *nbnxn_gpu_get_command_stream(gmx_nbnxn_gpu_t gmx_unused *nb,
-                                   int gmx_unused              iloc) CUDA_FUNC_TERM_WITH_RETURN(nullptr)
+void *gpu_get_command_stream(gmx_nbnxn_gpu_t gmx_unused     *nb,
+                             InteractionLocality gmx_unused  iloc) CUDA_FUNC_TERM_WITH_RETURN(nullptr)
  
  /** Returns an opaque pointer to the GPU coordinate+charge array
   *  Note: CUDA only.
   */
  CUDA_FUNC_QUALIFIER
-void *nbnxn_gpu_get_xq(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr)
+void *gpu_get_xq(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr)
  
  /** Returns an opaque pointer to the GPU force array
   *  Note: CUDA only.
   */
  CUDA_FUNC_QUALIFIER
-void *nbnxn_gpu_get_f(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr)
+void *gpu_get_f(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr)
  
  /** Returns an opaque pointer to the GPU shift force array
   *  Note: CUDA only.
   */
  CUDA_FUNC_QUALIFIER
-rvec *nbnxn_gpu_get_fshift(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr)
+     rvec *gpu_get_fshift(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr)
+
+} // namespace Nbnxm
  
  #endif
diff --git a/src/gromacs/nbnxm/gpu_types_common.h b/src/gromacs/nbnxm/gpu_types_common.h

index 78893704bb200ea552a567e22bd96a2864455679..4c57f22bcd301b3dcdbb19c7201e14f840b6007a 100644 (file)
--- a/src/gromacs/nbnxm/gpu_types_common.h
+++ b/src/gromacs/nbnxm/gpu_types_common.h
@@ -45,6 +45,9 @@
  #include "config.h"
  
  #include "gromacs/nbnxm/pairlist.h"
+#include "gromacs/utility/enumerationhelpers.h"
+
+#include "locality.h"
  
  #if GMX_GPU == GMX_GPU_OPENCL
  #include "gromacs/gpu_utils/gpuregiontimer_ocl.h"
@@ -54,24 +57,46 @@
  #include "gromacs/gpu_utils/gpuregiontimer.cuh"
  #endif
  
+namespace Nbnxm
+{
+
  /*! \internal
   * \brief GPU region timers used for timing GPU kernels and H2D/D2H transfers.
   *
   * The two-sized arrays hold the local and non-local values and should always
   * be indexed with eintLocal/eintNonlocal.
   */
-struct nbnxn_gpu_timers_t
+struct gpu_timers_t
  {
-    GpuRegionTimer atdat;              /**< timer for atom data transfer (every PS step)            */
-    GpuRegionTimer nb_h2d[2];          /**< timer for x/q H2D transfers (l/nl, every step)          */
-    GpuRegionTimer nb_d2h[2];          /**< timer for f D2H transfer (l/nl, every step)             */
-    GpuRegionTimer pl_h2d[2];          /**< timer for pair-list H2D transfers (l/nl, every PS step) */
-    bool           didPairlistH2D[2];  /**< true when a pair-list transfer has been done at this step */
-    GpuRegionTimer nb_k[2];            /**< timer for non-bonded kernels (l/nl, every step)         */
-    GpuRegionTimer prune_k[2];         /**< timer for the 1st pass list pruning kernel (l/nl, every PS step)   */
-    bool           didPrune[2];        /**< true when we timed pruning and the timings need to be accounted for */
-    GpuRegionTimer rollingPrune_k[2];  /**< timer for rolling pruning kernels (l/nl, frequency depends on chunk size)  */
-    bool           didRollingPrune[2]; /**< true when we timed rolling pruning (at the previous step) and the timings need to be accounted for */
+    /*! \internal
+     * \brief Timers for local or non-local coordinate/force transfers
+     */
+    struct XFTransfers
+    {
+        GpuRegionTimer nb_h2d; /**< timer for x/q H2D transfers (l/nl, every step) */
+        GpuRegionTimer nb_d2h; /**< timer for f D2H transfer (l/nl, every step) */
+    };
+
+    /*! \internal
+     * \brief Timers for local or non-local interaction related operations
+     */
+    struct Interaction
+    {
+        GpuRegionTimer pl_h2d;                  /**< timer for pair-list H2D transfers (l/nl, every PS step) */
+        bool           didPairlistH2D = false;  /**< true when a pair-list transfer has been done at this step */
+        GpuRegionTimer nb_k;                    /**< timer for non-bonded kernels (l/nl, every step)         */
+        GpuRegionTimer prune_k;                 /**< timer for the 1st pass list pruning kernel (l/nl, every PS step)   */
+        bool           didPrune = false;        /**< true when we timed pruning and the timings need to be accounted for */
+        GpuRegionTimer rollingPrune_k;          /**< timer for rolling pruning kernels (l/nl, frequency depends on chunk size)  */
+        bool           didRollingPrune = false; /**< true when we timed rolling pruning (at the previous step) and the timings need to be accounted for */
+    };
+
+    //! timer for atom data transfer (every PS step)
+    GpuRegionTimer                                                               atdat;
+    //! timers for coordinate/force transfers (every step)
+    gmx::EnumerationArray<AtomLocality, XFTransfers>                             xf;
+    //! timers for interaction related transfers
+    gmx::EnumerationArray<InteractionLocality, Nbnxm::gpu_timers_t::Interaction> interaction;
  };
  
  struct gpu_plist
@@ -99,4 +124,6 @@ struct gpu_plist
      int              rollingPruningPart;     /**< the next part to which the roling pruning needs to be applied */
  };
  
+} // namespace Nbnxm
+
  #endif
diff --git a/src/gromacs/nbnxm/kerneldispatch.cpp b/src/gromacs/nbnxm/kerneldispatch.cpp

index 276a2e69e3485d87c91d71da0374439d5c2fb300..14402aca91c080e2f080d9c2db47e39b9e3d3359 100644 (file)
--- a/src/gromacs/nbnxm/kerneldispatch.cpp
+++ b/src/gromacs/nbnxm/kerneldispatch.cpp
@@ -413,13 +413,13 @@ nbnxn_kernel_cpu(const nonbonded_verlet_group_t *nbvg,
      }
  }
  
-static void accountFlops(t_nrnb                    *nrnb,
-                         const nonbonded_verlet_t  &nbv,
-                         const int                  ilocality,
-                         const interaction_const_t &ic,
-                         int                        forceFlags)
+static void accountFlops(t_nrnb                           *nrnb,
+                         const nonbonded_verlet_t         &nbv,
+                         const Nbnxm::InteractionLocality  iLocality,
+                         const interaction_const_t        &ic,
+                         const int                         forceFlags)
  {
-    const nonbonded_verlet_group_t &nbvg            = nbv.grp[ilocality];
+    const nonbonded_verlet_group_t &nbvg            = nbv.grp[iLocality];
      const bool                      usingGpuKernels = (nbvg.kernel_type == nbnxnk8x8x8_GPU);
  
      int enr_nbnxn_kernel_ljc;
@@ -428,7 +428,7 @@ static void accountFlops(t_nrnb                    *nrnb,
          enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
      }
      else if ((!usingGpuKernels && nbvg.ewald_excl == ewaldexclAnalytical) ||
-             (usingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(nbv.gpu_nbv)))
+             (usingGpuKernels && Nbnxm::gpu_is_kernel_ewald_analytical(nbv.gpu_nbv)))
      {
          enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
      }
@@ -474,7 +474,7 @@ static void accountFlops(t_nrnb                    *nrnb,
  }
  
  void NbnxnDispatchKernel(nonbonded_verlet_t        *nbv,
-                         const int                  ilocality,
+                         Nbnxm::InteractionLocality iLocality,
                           const interaction_const_t &ic,
                           int                        forceFlags,
                           int                        clearF,
@@ -482,7 +482,7 @@ void NbnxnDispatchKernel(nonbonded_verlet_t        *nbv,
                           gmx_enerdata_t            *enerd,
                           t_nrnb                    *nrnb)
  {
-    const nonbonded_verlet_group_t &nbvg = nbv->grp[ilocality];
+    const nonbonded_verlet_group_t &nbvg = nbv->grp[iLocality];
  
      switch (nbvg.kernel_type)
      {
@@ -503,7 +503,7 @@ void NbnxnDispatchKernel(nonbonded_verlet_t        *nbv,
              break;
  
          case nbnxnk8x8x8_GPU:
-            nbnxn_gpu_launch_kernel(nbv->gpu_nbv, forceFlags, ilocality);
+            Nbnxm::gpu_launch_kernel(nbv->gpu_nbv, forceFlags, iLocality);
              break;
  
          case nbnxnk8x8x8_PlainC:
@@ -525,5 +525,5 @@ void NbnxnDispatchKernel(nonbonded_verlet_t        *nbv,
  
      }
  
-    accountFlops(nrnb, *nbv, ilocality, ic, forceFlags);
+    accountFlops(nrnb, *nbv, iLocality, ic, forceFlags);
  }
diff --git a/src/gromacs/nbnxm/locality.h b/src/gromacs/nbnxm/locality.h

new file mode 100644 (file)

index 0000000..6ebd4fb
--- /dev/null
+++ b/src/gromacs/nbnxm/locality.h
@@ -0,0 +1,76 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \libinternal \file
+ * \brief Defines nbnxn locality enums
+ *
+ * \author Berk Hess <hess@kth.se>
+ * \ingroup module_nbnxm
+ */
+
+#ifndef GMX_NBNXM_LOCALITY_H
+#define GMX_NBNXM_LOCALITY_H
+
+namespace Nbnxm
+{
+
+/*! \brief Atom locality indicator: local, non-local, all.
+ *
+ * Used for calls to:
+ * gridding, force calculation, x/f buffer operations
+ */
+enum class AtomLocality : int
+{
+    Local    = 0, //!< Local atoms
+    NonLocal = 1, //!< Non-local atoms
+    All      = 2, //!< Both local and non-local atoms
+    Count    = 3  //!< The number of atom locality types
+};
+
+/*! \brief Interaction locality indicator: local, non-local, all.
+ *
+ * Used for calls to:
+ * pair-search, force calculation, x/f buffer operations
+ */
+enum class InteractionLocality : int
+{
+    Local    = 0, //!< Interactions between local atoms only
+    NonLocal = 1, //!< Interactions between non-local and (non-)local atoms
+    Count    = 2  //!< The number of interaction locality types
+};
+
+}      // namespace Nbnxm
+
+#endif // GMX_NBNXM_LOCALITY_H
diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h

index 59886a5f8bad21254a51eba3e447cb3d1bfa97a3..9e0138e162a22273c50dfd2b916e79a715e69abd 100644 (file)
--- a/src/gromacs/nbnxm/nbnxm.h
+++ b/src/gromacs/nbnxm/nbnxm.h
@@ -104,8 +104,11 @@
  #include "gromacs/nbnxm/pairlist.h"
  #include "gromacs/nbnxm/pairlistset.h"
  #include "gromacs/utility/arrayref.h"
+#include "gromacs/utility/enumerationhelpers.h"
  #include "gromacs/utility/real.h"
  
+#include "locality.h"
+
  // TODO: Remove this include and the two nbnxm includes above
  #include "nbnxm_gpu.h"
  
@@ -148,50 +151,23 @@ typedef enum
      nbnxnkNR
  } nbnxn_kernel_type;
  
+namespace Nbnxm
+{
+
  /*! \brief Return a string identifying the kernel type.
   *
   * \param [in] kernel_type   nonbonded kernel types, takes values from the nbnxn_kernel_type enum
   * \returns                  a string identifying the kernel corresponding to the type passed as argument
   */
-const char *lookup_nbnxn_kernel_name(int kernel_type);
+const char *lookup_kernel_name(int kernel_type);
+
+} // namespace Nbnxm
  
  /*! \brief Ewald exclusion types */
  enum {
      ewaldexclTable, ewaldexclAnalytical
  };
  
-/*! \brief Atom locality indicator: local, non-local, all.
- *
- * Used for calls to:
- * gridding, pair-search, force calculation, x/f buffer operations
- * */
-enum {
-    eatLocal = 0, eatNonlocal = 1, eatAll
-};
-
-/*! \brief Tests for local atom range */
-#define LOCAL_A(x)               ((x) == eatLocal)
-/*! \brief Tests for non-local atom range */
-#define NONLOCAL_A(x)            ((x) == eatNonlocal)
-/*! \brief Tests for either local or non-local atom range */
-#define LOCAL_OR_NONLOCAL_A(x)   (LOCAL_A(x) || NONLOCAL_A(x))
-
-/*! \brief Interaction locality indicator
- *
- * Used in pair-list search/calculations in the following manner:
- *  - local interactions require local atom data and affect local output only;
- *  - non-local interactions require both local and non-local atom data and
- *    affect both local- and non-local output.
- */
-enum {
-    eintLocal = 0, eintNonlocal = 1
-};
-
-/*! \brief Tests for local interaction indicator */
-#define LOCAL_I(x)               ((x) == eintLocal)
-/*! \brief Tests for non-local interaction indicator */
-#define NONLOCAL_I(x)            ((x) == eintNonlocal)
-
  /*! \brief Flag to tell the nonbonded kernels whether to clear the force output buffers */
  enum {
      enbvClearFNo, enbvClearFYes
@@ -207,19 +183,25 @@ typedef struct nonbonded_verlet_group_t {
  
  /*! \libinternal
   *  \brief Top-level non-bonded data structure for the Verlet-type cut-off scheme. */
-typedef struct nonbonded_verlet_t {
-    std::unique_ptr<NbnxnListParameters> listParams;      /**< Parameters for the search and list pruning setup */
-    std::unique_ptr<nbnxn_search>        nbs;             /**< n vs n atom pair searching data       */
-    int                                  ngrp;            /**< number of interaction groups          */
-    nonbonded_verlet_group_t             grp[2];          /**< local and non-local interaction group */
-    nbnxn_atomdata_t                    *nbat;            /**< atom data                             */
-
-    gmx_bool                             bUseGPU;         /**< TRUE when non-bonded interactions are computed on a physical GPU */
-    EmulateGpuNonbonded                  emulateGpu;      /**< true when non-bonded interactions are computed on the CPU using GPU-style pair lists */
-    gmx_nbnxn_gpu_t                     *gpu_nbv;         /**< pointer to GPU nb verlet data     */
-    int                                  min_ci_balanced; /**< pair list balancing parameter
-                                                               used for the 8x8x8 GPU kernels    */
-} nonbonded_verlet_t;
+struct nonbonded_verlet_t
+{
+    std::unique_ptr<NbnxnListParameters>                                        listParams; /**< Parameters for the search and list pruning setup */
+    std::unique_ptr<nbnxn_search>                                               nbs;        /**< n vs n atom pair searching data       */
+    int                                                                         ngrp;       /**< number of interaction groups          */
+    //! Local and non-local interaction group
+    gmx::EnumerationArray<Nbnxm::InteractionLocality, nonbonded_verlet_group_t> grp;
+    //! Atom data
+    nbnxn_atomdata_t                                                           *nbat;
+
+    gmx_bool                                                                    bUseGPU;         /**< TRUE when non-bonded interactions are computed on a physical GPU */
+    EmulateGpuNonbonded                                                         emulateGpu;      /**< true when non-bonded interactions are computed on the CPU using GPU-style pair lists */
+    gmx_nbnxn_gpu_t                                                            *gpu_nbv;         /**< pointer to GPU nb verlet data     */
+    int                                                                         min_ci_balanced; /**< pair list balancing parameter
+                                                                                                      used for the 8x8x8 GPU kernels    */
+};
+
+namespace Nbnxm
+{
  
  /*! \brief Initializes the nbnxn module */
  void init_nb_verlet(const gmx::MDLogger     &mdlog,
@@ -233,6 +215,8 @@ void init_nb_verlet(const gmx::MDLogger     &mdlog,
                      const gmx_mtop_t        *mtop,
                      matrix                   box);
  
+} // namespace Nbnxm
+
  /*! \brief Put the atoms on the pair search grid.
   *
   * Only atoms atomStart to atomEnd in x are put on the grid.
@@ -290,18 +274,18 @@ gmx::ArrayRef<const int> nbnxn_get_gridindices(const nbnxn_search* nbs);
   * pairs beyond the pairlist inner radius and writes the result to a list that is
   * to be consumed by the non-bonded kernel.
   */
-void NbnxnDispatchPruneKernel(nonbonded_verlet_t *nbv,
-                              int                 ilocality,
-                              const rvec         *shift_vec);
+void NbnxnDispatchPruneKernel(nonbonded_verlet_t         *nbv,
+                              Nbnxm::InteractionLocality  iLocality,
+                              const rvec                 *shift_vec);
  
  /*! \brief Executes the non-bonded kernel of the GPU or launches it on the GPU */
-void NbnxnDispatchKernel(nonbonded_verlet_t        *nbv,
-                         int                        ilocality,
-                         const interaction_const_t &ic,
-                         int                        forceFlags,
-                         int                        clearF,
-                         t_forcerec                *fr,
-                         gmx_enerdata_t            *enerd,
-                         t_nrnb                    *nrnb);
+void NbnxnDispatchKernel(nonbonded_verlet_t         *nbv,
+                         Nbnxm::InteractionLocality  iLocality,
+                         const interaction_const_t  &ic,
+                         int                         forceFlags,
+                         int                         clearF,
+                         t_forcerec                 *fr,
+                         gmx_enerdata_t             *enerd,
+                         t_nrnb                     *nrnb);
  
  #endif // GMX_NBNXN_NBNXN_H
diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h

index 6aa7c21752129bfad7a344a16c0b3dc261469993..8e5f77a9af48d89a82f570e47e1da4bd4fd9b01b 100644 (file)
--- a/src/gromacs/nbnxm/nbnxm_gpu.h
+++ b/src/gromacs/nbnxm/nbnxm_gpu.h
@@ -49,10 +49,14 @@
  #include "gromacs/utility/real.h"
  
  #include "gpu_types.h"
+#include "locality.h"
  
  struct nbnxn_atomdata_t;
  enum class GpuTaskCompletion;
  
+namespace Nbnxm
+{
+
  /*! \brief
   * Launch asynchronously the xq buffer host to device copy.
   *
@@ -61,14 +65,14 @@ enum class GpuTaskCompletion;
   *
   * \param [in]    nb        GPU nonbonded data.
   * \param [in]    nbdata    Host-side atom data structure.
- * \param [in]    iloc      Interaction locality flag.
+ * \param [in]    aloc      Atom locality flag.
   * \param [in]    haveOtherWork  True if there are other tasks that require the nbnxn coordinate input.
   */
  GPU_FUNC_QUALIFIER
-void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_gpu_t gmx_unused               *nb,
-                              const struct nbnxn_atomdata_t gmx_unused *nbdata,
-                              int gmx_unused                            iloc,
-                              bool gmx_unused                           haveOtherWork) GPU_FUNC_TERM
+void gpu_copy_xq_to_gpu(gmx_nbnxn_gpu_t gmx_unused               *nb,
+                        const struct nbnxn_atomdata_t gmx_unused *nbdata,
+                        AtomLocality gmx_unused                   aloc,
+                        bool gmx_unused                           haveOtherWork) GPU_FUNC_TERM
  
  /*! \brief
   * Launch asynchronously the nonbonded force calculations.
@@ -81,9 +85,9 @@ void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_gpu_t gmx_unused               *nb,
   *
   */
  GPU_FUNC_QUALIFIER
-void nbnxn_gpu_launch_kernel(gmx_nbnxn_gpu_t gmx_unused               *nb,
-                             int gmx_unused                            flags,
-                             int gmx_unused                            iloc) GPU_FUNC_TERM
+void gpu_launch_kernel(gmx_nbnxn_gpu_t gmx_unused     *nb,
+                       int gmx_unused                  flags,
+                       InteractionLocality gmx_unused  iloc) GPU_FUNC_TERM
  
  /*! \brief
   * Launch asynchronously the nonbonded prune-only kernel.
@@ -121,9 +125,9 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_gpu_t gmx_unused               *nb,
   * \param [in]    numParts  Number of parts the pair list is split into in the rolling kernel.
   */
  GPU_FUNC_QUALIFIER
-void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t gmx_unused *nb,
-                                       int gmx_unused              iloc,
-                                       int gmx_unused              numParts) GPU_FUNC_TERM
+void gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t gmx_unused     *nb,
+                                 InteractionLocality gmx_unused  iloc,
+                                 int gmx_unused                  numParts) GPU_FUNC_TERM
  
  /*! \brief
   * Launch asynchronously the download of nonbonded forces from the GPU
@@ -132,11 +136,11 @@ void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t gmx_unused *nb,
   * no non-bonded work.
   */
  GPU_FUNC_QUALIFIER
-void nbnxn_gpu_launch_cpyback(gmx_nbnxn_gpu_t  gmx_unused              *nb,
-                              struct nbnxn_atomdata_t gmx_unused       *nbatom,
-                              int                    gmx_unused         flags,
-                              int                    gmx_unused         aloc,
-                              bool                   gmx_unused         haveOtherWork) GPU_FUNC_TERM
+void gpu_launch_cpyback(gmx_nbnxn_gpu_t  gmx_unused *nb,
+                        nbnxn_atomdata_t gmx_unused *nbatom,
+                        int              gmx_unused  flags,
+                        AtomLocality     gmx_unused  aloc,
+                        bool             gmx_unused  haveOtherWork) GPU_FUNC_TERM
  
  /*! \brief Attempts to complete nonbonded GPU task.
   *
@@ -172,14 +176,14 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_gpu_t  gmx_unused              *nb,
   * \returns              True if the nonbonded tasks associated with \p aloc locality have completed
   */
  GPU_FUNC_QUALIFIER
-bool nbnxn_gpu_try_finish_task(gmx_nbnxn_gpu_t gmx_unused  *nb,
-                               int             gmx_unused   flags,
-                               int             gmx_unused   aloc,
-                               bool            gmx_unused   haveOtherWork,
-                               real            gmx_unused  *e_lj,
-                               real            gmx_unused  *e_el,
-                               rvec            gmx_unused  *fshift,
-                               GpuTaskCompletion gmx_unused completionKind) GPU_FUNC_TERM_WITH_RETURN(false)
+bool gpu_try_finish_task(gmx_nbnxn_gpu_t gmx_unused  *nb,
+                         int             gmx_unused   flags,
+                         AtomLocality    gmx_unused   aloc,
+                         bool            gmx_unused   haveOtherWork,
+                         real            gmx_unused  *e_lj,
+                         real            gmx_unused  *e_el,
+                         rvec            gmx_unused  *fshift,
+                         GpuTaskCompletion gmx_unused completionKind) GPU_FUNC_TERM_WITH_RETURN(false)
  
  /*! \brief  Completes the nonbonded GPU task blocking until GPU tasks and data
   * transfers to finish.
@@ -197,16 +201,18 @@ bool nbnxn_gpu_try_finish_task(gmx_nbnxn_gpu_t gmx_unused  *nb,
   * \param[out] fshift Pointer to the shift force buffer to accumulate into
   */
  GPU_FUNC_QUALIFIER
-void nbnxn_gpu_wait_finish_task(gmx_nbnxn_gpu_t gmx_unused *nb,
-                                int             gmx_unused  flags,
-                                int             gmx_unused  aloc,
-                                bool            gmx_unused  haveOtherWork,
-                                real            gmx_unused *e_lj,
-                                real            gmx_unused *e_el,
-                                rvec            gmx_unused *fshift) GPU_FUNC_TERM
+void gpu_wait_finish_task(gmx_nbnxn_gpu_t gmx_unused *nb,
+                          int             gmx_unused  flags,
+                          AtomLocality    gmx_unused  aloc,
+                          bool            gmx_unused  haveOtherWork,
+                          real            gmx_unused *e_lj,
+                          real            gmx_unused *e_el,
+                          rvec            gmx_unused *fshift) GPU_FUNC_TERM
  
  /*! \brief Selects the Ewald kernel type, analytical or tabulated, single or twin cut-off. */
  GPU_FUNC_QUALIFIER
-int nbnxn_gpu_pick_ewald_kernel_type(bool gmx_unused bTwinCut) GPU_FUNC_TERM_WITH_RETURN(-1)
+int gpu_pick_ewald_kernel_type(bool gmx_unused bTwinCut) GPU_FUNC_TERM_WITH_RETURN(-1)
+
+} // namespace Nbnxm
  
  #endif
diff --git a/src/gromacs/nbnxm/nbnxm_setup.cpp b/src/gromacs/nbnxm/nbnxm_setup.cpp

index d4dafe11c7099e6b2b2a91f25ce25cca5ca94308..0565d3faf0b11854e38d35974d713939415b39e3 100644 (file)
--- a/src/gromacs/nbnxm/nbnxm_setup.cpp
+++ b/src/gromacs/nbnxm/nbnxm_setup.cpp
@@ -62,6 +62,9 @@
  #include "grid.h"
  #include "internal.h"
  
+namespace Nbnxm
+{
+
  /*! \brief Returns whether CPU SIMD support exists for the given inputrec
   *
   * If the return value is FALSE and fplog/cr != NULL, prints a fallback
@@ -134,7 +137,7 @@ static void pick_nbnxn_kernel_cpu(const t_inputrec gmx_unused    *ir,
              /* One 256-bit FMA per cycle makes 2xNN faster */
              *kernel_type = nbnxnk4xN_SIMD_2xNN;
          }
-#endif  /* GMX_NBNXN_SIMD_2XNN && GMX_NBNXN_SIMD_4XN */
+#endif      /* GMX_NBNXN_SIMD_2XNN && GMX_NBNXN_SIMD_4XN */
  
  
          if (getenv("GMX_NBNXN_SIMD_4XN") != nullptr)
@@ -181,10 +184,10 @@ static void pick_nbnxn_kernel_cpu(const t_inputrec gmx_unused    *ir,
          }
  
      }
-#endif // GMX_SIMD
+#endif  // GMX_SIMD
  }
  
-const char *lookup_nbnxn_kernel_name(int kernel_type)
+const char *lookup_kernel_name(int kernel_type)
  {
      const char *returnvalue = nullptr;
      switch (kernel_type)
@@ -260,7 +263,7 @@ static void pick_nbnxn_kernel(const gmx::MDLogger &mdlog,
      {
          GMX_LOG(mdlog.info).asParagraph().appendTextFormatted(
                  "Using %s %dx%d nonbonded short-range kernels",
-                lookup_nbnxn_kernel_name(*kernel_type),
+                lookup_kernel_name(*kernel_type),
                  nbnxn_kernel_to_cluster_i_size(*kernel_type),
                  nbnxn_kernel_to_cluster_j_size(*kernel_type));
  
@@ -270,7 +273,7 @@ static void pick_nbnxn_kernel(const gmx::MDLogger &mdlog,
              GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
                      "WARNING: Using the slow %s kernels. This should\n"
                      "not happen during routine usage on supported platforms.",
-                    lookup_nbnxn_kernel_name(*kernel_type));
+                    lookup_kernel_name(*kernel_type));
          }
      }
  }
@@ -388,13 +391,13 @@ void init_nb_verlet(const gmx::MDLogger     &mdlog,
      {
          /* init the NxN GPU data; the last argument tells whether we'll have
           * both local and non-local NB calculation on GPU */
-        nbnxn_gpu_init(&nbv->gpu_nbv,
-                       deviceInfo,
-                       fr->ic,
-                       nbv->listParams.get(),
-                       nbv->nbat,
-                       cr->nodeid,
-                       (nbv->ngrp > 1));
+        gpu_init(&nbv->gpu_nbv,
+                 deviceInfo,
+                 fr->ic,
+                 nbv->listParams.get(),
+                 nbv->nbat,
+                 cr->nodeid,
+                 (nbv->ngrp > 1));
  
          if ((env = getenv("GMX_NB_MIN_CI")) != nullptr)
          {
@@ -414,7 +417,7 @@ void init_nb_verlet(const gmx::MDLogger     &mdlog,
          }
          else
          {
-            nbv->min_ci_balanced = nbnxn_gpu_min_ci_balanced(nbv->gpu_nbv);
+            nbv->min_ci_balanced = gpu_min_ci_balanced(nbv->gpu_nbv);
              if (debug)
              {
                  fprintf(debug, "Neighbor-list balancing parameter: %d (auto-adjusted to the number of GPU multi-processors)\n",
@@ -426,3 +429,5 @@ void init_nb_verlet(const gmx::MDLogger     &mdlog,
  
      *nb_verlet = nbv;
  }
+
+} // namespace Nbnxm
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp

index 8aac82d3d34f4253e0a7d69a3ce77b41b34e32bd..34132dbbd753ed7a9a9737ea10891d3d578bfbe9 100644 (file)
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
@@ -88,6 +88,9 @@
  #include "nbnxm_ocl_internal.h"
  #include "nbnxm_ocl_types.h"
  
+namespace Nbnxm
+{
+
  /*! \brief Convenience constants */
  //@{
  static const int c_numClPerSupercl = c_nbnxnGpuNumClusterPerSupercluster;
@@ -359,19 +362,22 @@ static void sync_ocl_event(cl_command_queue stream, cl_event *ocl_event)
  }
  
  /*! \brief Launch asynchronously the xq buffer host to device copy. */
-void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_ocl_t        *nb,
-                              const nbnxn_atomdata_t *nbatom,
-                              int                     iloc,
-                              bool                    haveOtherWork)
+void gpu_copy_xq_to_gpu(gmx_nbnxn_ocl_t        *nb,
+                        const nbnxn_atomdata_t *nbatom,
+                        const AtomLocality      atomLocality,
+                        const bool              haveOtherWork)
  {
-    int                  adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
+    const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
+
+    /* local/nonlocal offset and length used for xq and f */
+    int                  adat_begin, adat_len;
  
      cl_atomdata_t       *adat    = nb->atdat;
      cl_plist_t          *plist   = nb->plist[iloc];
      cl_timers_t         *t       = nb->timers;
      cl_command_queue     stream  = nb->stream[iloc];
  
-    bool                 bDoTime     = (nb->bDoTime) != 0;
+    bool                 bDoTime = (nb->bDoTime) != 0;
  
      /* Don't launch the non-local H2D copy if there is no dependent
         work to do: neither non-local nor other (e.g. bonded) work
@@ -382,7 +388,7 @@ void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_ocl_t        *nb,
         we always call the local local x+q copy (and the rest of the local
         work in nbnxn_gpu_launch_kernel().
       */
-    if (!haveOtherWork && canSkipWork(nb, iloc))
+    if (!haveOtherWork && canSkipWork(*nb, iloc))
      {
          plist->haveFreshList = false;
  
@@ -390,7 +396,7 @@ void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_ocl_t        *nb,
      }
  
      /* calculate the atom data index range based on locality */
-    if (LOCAL_I(iloc))
+    if (atomLocality == AtomLocality::Local)
      {
          adat_begin  = 0;
          adat_len    = adat->natoms_local;
@@ -404,16 +410,16 @@ void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_ocl_t        *nb,
      /* beginning of timed HtoD section */
      if (bDoTime)
      {
-        t->nb_h2d[iloc].openTimingRegion(stream);
+        t->xf[atomLocality].nb_h2d.openTimingRegion(stream);
      }
  
      /* HtoD x, q */
      ocl_copy_H2D_async(adat->xq, nbatom->x().data() + adat_begin * 4, adat_begin*sizeof(float)*4,
-                       adat_len * sizeof(float) * 4, stream, bDoTime ? t->nb_h2d[iloc].fetchNextEvent() : nullptr);
+                       adat_len * sizeof(float) * 4, stream, bDoTime ? t->xf[atomLocality].nb_h2d.fetchNextEvent() : nullptr);
  
      if (bDoTime)
      {
-        t->nb_h2d[iloc].closeTimingRegion(stream);
+        t->xf[atomLocality].nb_h2d.closeTimingRegion(stream);
      }
  
      /* When we get here all misc operations issues in the local stream as well as
@@ -421,7 +427,7 @@ void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_ocl_t        *nb,
         so we record that in the local stream and wait for it in the nonlocal one. */
      if (nb->bUseTwoStreams)
      {
-        if (iloc == eintLocal)
+        if (iloc == InteractionLocality::Local)
          {
              cl_int gmx_used_in_debug cl_error = clEnqueueMarkerWithWaitList(stream, 0, nullptr, &(nb->misc_ops_and_local_H2D_done));
              assert(CL_SUCCESS == cl_error);
@@ -459,9 +465,9 @@ void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_ocl_t        *nb,
     misc_ops_done event to record the point in time when the above  operations
     are finished and synchronize with this event in the non-local stream.
   */
-void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t               *nb,
-                             int                            flags,
-                             int                            iloc)
+void gpu_launch_kernel(gmx_nbnxn_ocl_t                  *nb,
+                       const int                         flags,
+                       const Nbnxm::InteractionLocality  iloc)
  {
      /* OpenCL kernel launch-related stuff */
      cl_kernel            nb_kernel = nullptr;  /* fn pointer to the nonbonded kernel */
@@ -487,7 +493,7 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t               *nb,
         clearing. All these operations, except for the local interaction kernel,
         are needed for the non-local interactions. The skip of the local kernel
         call is taken care of later in this function. */
-    if (canSkipWork(nb, iloc))
+    if (canSkipWork(*nb, iloc))
      {
          plist->haveFreshList = false;
  
@@ -500,7 +506,7 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t               *nb,
             (that's the way the timing accounting can distinguish between
             separate prune kernel and combined force+prune).
           */
-        nbnxn_gpu_launch_kernel_pruneonly(nb, iloc, 1);
+        Nbnxm::gpu_launch_kernel_pruneonly(nb, iloc, 1);
      }
  
      if (plist->nsci == 0)
@@ -513,7 +519,7 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t               *nb,
      /* beginning of timed nonbonded calculation section */
      if (bDoTime)
      {
-        t->nb_k[iloc].openTimingRegion(stream);
+        t->interaction[iloc].nb_k.openTimingRegion(stream);
      }
  
      /* get the pointer to the kernel flavor we need to use */
@@ -521,7 +527,7 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t               *nb,
                                      nbp->eeltype,
                                      nbp->vdwtype,
                                      bCalcEner,
-                                    (plist->haveFreshList && !nb->timers->didPrune[iloc]));
+                                    (plist->haveFreshList && !nb->timers->interaction[iloc].didPrune));
  
      /* kernel launch config */
  
@@ -545,7 +551,7 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t               *nb,
  
      fillin_ocl_structures(nbp, &nbparams_params);
  
-    auto          *timingEvent  = bDoTime ? t->nb_k[iloc].fetchNextEvent() : nullptr;
+    auto          *timingEvent  = bDoTime ? t->interaction[iloc].nb_k.fetchNextEvent() : nullptr;
      constexpr char kernelName[] = "k_calc_nb";
      if (useLjCombRule(nb->nbparam->vdwtype))
      {
@@ -570,7 +576,7 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t               *nb,
  
      if (bDoTime)
      {
-        t->nb_k[iloc].closeTimingRegion(stream);
+        t->interaction[iloc].nb_k.closeTimingRegion(stream);
      }
  }
  
@@ -600,9 +606,9 @@ static inline int calc_shmem_required_prune(const int num_threads_z)
      return shmem;
  }
  
-void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t       *nb,
-                                       int                    iloc,
-                                       int                    numParts)
+void gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t           *nb,
+                                 const InteractionLocality  iloc,
+                                 const int                  numParts)
  {
      cl_atomdata_t       *adat    = nb->atdat;
      cl_nbparam_t        *nbp     = nb->nbparam;
@@ -656,7 +662,7 @@ void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t       *nb,
      GpuRegionTimer *timer = nullptr;
      if (bDoTime)
      {
-        timer = &(plist->haveFreshList ? t->prune_k[iloc] : t->rollingPrune_k[iloc]);
+        timer = &(plist->haveFreshList ? t->interaction[iloc].prune_k : t->interaction[iloc].rollingPrune_k);
      }
  
      /* beginning of timed prune calculation section */
@@ -708,12 +714,12 @@ void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t       *nb,
      {
          plist->haveFreshList         = false;
          /* Mark that pruning has been done */
-        nb->timers->didPrune[iloc] = true;
+        nb->timers->interaction[iloc].didPrune = true;
      }
      else
      {
          /* Mark that rolling pruning has been done */
-        nb->timers->didRollingPrune[iloc] = true;
+        nb->timers->interaction[iloc].didRollingPrune = true;
      }
  
      if (bDoTime)
@@ -726,29 +732,29 @@ void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t       *nb,
   * Launch asynchronously the download of nonbonded forces from the GPU
   * (and energies/shift forces if required).
   */
-void nbnxn_gpu_launch_cpyback(gmx_nbnxn_ocl_t               *nb,
-                              struct nbnxn_atomdata_t       *nbatom,
-                              int                            flags,
-                              int                            aloc,
-                              bool                           haveOtherWork)
+void gpu_launch_cpyback(gmx_nbnxn_ocl_t               *nb,
+                        struct nbnxn_atomdata_t       *nbatom,
+                        const int                      flags,
+                        const AtomLocality             aloc,
+                        const bool                     haveOtherWork)
  {
      cl_int gmx_unused cl_error;
      int               adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
  
      /* determine interaction locality from atom locality */
-    int              iloc = gpuAtomToInteractionLocality(aloc);
+    const InteractionLocality iloc = gpuAtomToInteractionLocality(aloc);
  
-    cl_atomdata_t   *adat    = nb->atdat;
-    cl_timers_t     *t       = nb->timers;
-    bool             bDoTime = nb->bDoTime == CL_TRUE;
-    cl_command_queue stream  = nb->stream[iloc];
+    cl_atomdata_t            *adat    = nb->atdat;
+    cl_timers_t              *t       = nb->timers;
+    bool                      bDoTime = nb->bDoTime == CL_TRUE;
+    cl_command_queue          stream  = nb->stream[iloc];
  
-    bool             bCalcEner   = (flags & GMX_FORCE_ENERGY) != 0;
-    int              bCalcFshift = flags & GMX_FORCE_VIRIAL;
+    bool                      bCalcEner   = (flags & GMX_FORCE_ENERGY) != 0;
+    int                       bCalcFshift = flags & GMX_FORCE_VIRIAL;
  
  
      /* don't launch non-local copy-back if there was no non-local work to do */
-    if (!haveOtherWork && canSkipWork(nb, iloc))
+    if (!haveOtherWork && canSkipWork(*nb, iloc))
      {
          /* TODO An alternative way to signal that non-local work is
             complete is to use a clEnqueueMarker+clEnqueueBarrier
@@ -767,19 +773,19 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_ocl_t               *nb,
      /* beginning of timed D2H section */
      if (bDoTime)
      {
-        t->nb_d2h[iloc].openTimingRegion(stream);
+        t->xf[aloc].nb_d2h.openTimingRegion(stream);
      }
  
      /* With DD the local D2H transfer can only start after the non-local
         has been launched. */
-    if (iloc == eintLocal && nb->bNonLocalStreamActive)
+    if (iloc == InteractionLocality::Local && nb->bNonLocalStreamActive)
      {
          sync_ocl_event(stream, &(nb->nonlocal_done));
      }
  
      /* DtoH f */
      ocl_copy_D2H_async(nbatom->out[0].f.data() + adat_begin * 3, adat->f, adat_begin*3*sizeof(float),
-                       (adat_len)* adat->f_elem_size, stream, bDoTime ? t->nb_d2h[iloc].fetchNextEvent() : nullptr);
+                       (adat_len)* adat->f_elem_size, stream, bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
  
      /* kick off work */
      cl_error = clFlush(stream);
@@ -789,7 +795,7 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_ocl_t               *nb,
         recorded which signals that the local D2H can proceed. This event is not
         placed after the non-local kernel because we first need the non-local
         data back first. */
-    if (iloc == eintNonlocal)
+    if (iloc == InteractionLocality::NonLocal)
      {
          cl_error = clEnqueueMarkerWithWaitList(stream, 0, nullptr, &(nb->nonlocal_done));
          assert(CL_SUCCESS == cl_error);
@@ -797,35 +803,35 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_ocl_t               *nb,
      }
  
      /* only transfer energies in the local stream */
-    if (LOCAL_I(iloc))
+    if (iloc == InteractionLocality::Local)
      {
          /* DtoH fshift */
          if (bCalcFshift)
          {
              ocl_copy_D2H_async(nb->nbst.fshift, adat->fshift, 0,
-                               SHIFTS * adat->fshift_elem_size, stream, bDoTime ? t->nb_d2h[iloc].fetchNextEvent() : nullptr);
+                               SHIFTS * adat->fshift_elem_size, stream, bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
          }
  
          /* DtoH energies */
          if (bCalcEner)
          {
              ocl_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, 0,
-                               sizeof(float), stream, bDoTime ? t->nb_d2h[iloc].fetchNextEvent() : nullptr);
+                               sizeof(float), stream, bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
  
              ocl_copy_D2H_async(nb->nbst.e_el, adat->e_el, 0,
-                               sizeof(float), stream, bDoTime ? t->nb_d2h[iloc].fetchNextEvent() : nullptr);
+                               sizeof(float), stream, bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
          }
      }
  
      if (bDoTime)
      {
-        t->nb_d2h[iloc].closeTimingRegion(stream);
+        t->xf[aloc].nb_d2h.closeTimingRegion(stream);
      }
  }
  
  
  /*! \brief Selects the Ewald kernel type, analytical or tabulated, single or twin cut-off. */
-int nbnxn_gpu_pick_ewald_kernel_type(bool bTwinCut)
+int gpu_pick_ewald_kernel_type(const bool bTwinCut)
  {
      bool bUseAnalyticalEwald, bForceAnalyticalEwald, bForceTabulatedEwald;
      int  kernel_type;
@@ -879,3 +885,5 @@ int nbnxn_gpu_pick_ewald_kernel_type(bool bTwinCut)
  
      return kernel_type;
  }
+
+} // namespace Nbnxm
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp

index 27a2cd806e103128592236943a20c22cd05745d9..72be7654413da5ea4753ca1e78cd9b474cab67a4 100644 (file)
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
@@ -50,6 +50,11 @@
  
  #include <cmath>
  
+// TODO We would like to move this down, but the way gmx_nbnxn_gpu_t
+//      is currently declared means this has to be before gpu_types.h
+#include "nbnxm_ocl_types.h"
+
+// TODO Remove this comment when the above order issue is resolved
  #include "gromacs/gpu_utils/gpu_utils.h"
  #include "gromacs/gpu_utils/oclutils.h"
  #include "gromacs/hardware/gpu_hw_info.h"
@@ -70,7 +75,9 @@
  #include "gromacs/utility/smalloc.h"
  
  #include "nbnxm_ocl_internal.h"
-#include "nbnxm_ocl_types.h"
+
+namespace Nbnxm
+{
  
  /*! \brief This parameter should be determined heuristically from the
   * kernel execution times
@@ -276,7 +283,7 @@ map_interaction_types_to_gpu_kernel_flavors(const interaction_const_t *ic,
      else if ((EEL_PME(ic->eeltype) || ic->eeltype == eelEWALD))
      {
          /* Initially rcoulomb == rvdw, so it's surely not twin cut-off. */
-        *gpu_eeltype = nbnxn_gpu_pick_ewald_kernel_type(false);
+        *gpu_eeltype = gpu_pick_ewald_kernel_type(false);
      }
      else
      {
@@ -398,11 +405,11 @@ static void init_nbparam(cl_nbparam_t                    *nbp,
  }
  
  //! This function is documented in the header file
-void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t    *nbv,
-                                        const interaction_const_t   *ic,
-                                        const NbnxnListParameters   *listParams)
+void gpu_pme_loadbal_update_param(const nonbonded_verlet_t    *nbv,
+                                  const interaction_const_t   *ic,
+                                  const NbnxnListParameters   *listParams)
  {
-    if (!nbv || nbv->grp[0].kernel_type != nbnxnk8x8x8_GPU)
+    if (!nbv || nbv->grp[InteractionLocality::Local].kernel_type != nbnxnk8x8x8_GPU)
      {
          return;
      }
@@ -411,7 +418,7 @@ void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t    *nbv,
  
      set_cutoff_parameters(nbp, ic, listParams);
  
-    nbp->eeltype = nbnxn_gpu_pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw);
+    nbp->eeltype = gpu_pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw);
  
      init_ewald_coulomb_force_table(ic, nb->nbparam, nb->dev_rundata);
  }
@@ -440,19 +447,6 @@ static void init_plist(cl_plist_t *pl)
      pl->haveFreshList  = false;
  }
  
-/*! \brief Initializes the timer data structure.
- */
-static void init_timers(cl_timers_t *t,
-                        bool         bUseTwoStreams)
-{
-    for (int i = 0; i <= (bUseTwoStreams ? 1 : 0); i++)
-    {
-        t->didPairlistH2D[i]  = false;
-        t->didPrune[i]        = false;
-        t->didRollingPrune[i] = false;
-    }
-}
-
  /*! \brief Initializes the timings data structure.
   */
  static void init_timings(gmx_wallclock_gpu_nbnxn_t *t)
@@ -547,7 +541,7 @@ nbnxn_ocl_clear_e_fshift(gmx_nbnxn_ocl_t *nb)
  
      cl_int               cl_error;
      cl_atomdata_t *      adat     = nb->atdat;
-    cl_command_queue     ls       = nb->stream[eintLocal];
+    cl_command_queue     ls       = nb->stream[InteractionLocality::Local];
  
      size_t               local_work_size[3]   = {1, 1, 1};
      size_t               global_work_size[3]  = {1, 1, 1};
@@ -613,13 +607,13 @@ static void nbnxn_ocl_init_const(gmx_nbnxn_ocl_t                *nb,
  
  
  //! This function is documented in the header file
-void nbnxn_gpu_init(gmx_nbnxn_ocl_t          **p_nb,
-                    const gmx_device_info_t   *deviceInfo,
-                    const interaction_const_t *ic,
-                    const NbnxnListParameters *listParams,
-                    const nbnxn_atomdata_t    *nbat,
-                    int                        rank,
-                    gmx_bool                   bLocalAndNonlocal)
+void gpu_init(gmx_nbnxn_ocl_t          **p_nb,
+              const gmx_device_info_t   *deviceInfo,
+              const interaction_const_t *ic,
+              const NbnxnListParameters *listParams,
+              const nbnxn_atomdata_t    *nbat,
+              const int                  rank,
+              const gmx_bool             bLocalAndNonlocal)
  {
      gmx_nbnxn_ocl_t            *nb;
      cl_int                      cl_error;
@@ -635,10 +629,10 @@ void nbnxn_gpu_init(gmx_nbnxn_ocl_t          **p_nb,
      snew(nb, 1);
      snew(nb->atdat, 1);
      snew(nb->nbparam, 1);
-    snew(nb->plist[eintLocal], 1);
+    snew(nb->plist[InteractionLocality::Local], 1);
      if (bLocalAndNonlocal)
      {
-        snew(nb->plist[eintNonlocal], 1);
+        snew(nb->plist[InteractionLocality::NonLocal], 1);
      }
  
      nb->bUseTwoStreams = static_cast<cl_bool>(bLocalAndNonlocal);
@@ -655,7 +649,7 @@ void nbnxn_gpu_init(gmx_nbnxn_ocl_t          **p_nb,
      pmalloc(reinterpret_cast<void**>(&nb->nbst.e_el), sizeof(*nb->nbst.e_el));
      pmalloc(reinterpret_cast<void**>(&nb->nbst.fshift), SHIFTS * sizeof(*nb->nbst.fshift));
  
-    init_plist(nb->plist[eintLocal]);
+    init_plist(nb->plist[InteractionLocality::Local]);
  
      /* OpenCL timing disabled if GMX_DISABLE_GPU_TIMING is defined. */
      nb->bDoTime = static_cast<cl_bool>(getenv("GMX_DISABLE_GPU_TIMING") == nullptr);
@@ -673,7 +667,8 @@ void nbnxn_gpu_init(gmx_nbnxn_ocl_t          **p_nb,
      nbnxn_gpu_create_context(nb->dev_rundata, nb->dev_info, rank);
  
      /* local/non-local GPU streams */
-    nb->stream[eintLocal] = clCreateCommandQueue(nb->dev_rundata->context, nb->dev_info->ocl_gpu_id.ocl_device_id, queue_properties, &cl_error);
+    nb->stream[InteractionLocality::Local] =
+        clCreateCommandQueue(nb->dev_rundata->context, nb->dev_info->ocl_gpu_id.ocl_device_id, queue_properties, &cl_error);
      if (CL_SUCCESS != cl_error)
      {
          gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d",
@@ -684,9 +679,10 @@ void nbnxn_gpu_init(gmx_nbnxn_ocl_t          **p_nb,
  
      if (nb->bUseTwoStreams)
      {
-        init_plist(nb->plist[eintNonlocal]);
+        init_plist(nb->plist[InteractionLocality::NonLocal]);
  
-        nb->stream[eintNonlocal] = clCreateCommandQueue(nb->dev_rundata->context, nb->dev_info->ocl_gpu_id.ocl_device_id, queue_properties, &cl_error);
+        nb->stream[InteractionLocality::NonLocal] =
+            clCreateCommandQueue(nb->dev_rundata->context, nb->dev_info->ocl_gpu_id.ocl_device_id, queue_properties, &cl_error);
          if (CL_SUCCESS != cl_error)
          {
              gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d",
@@ -698,7 +694,6 @@ void nbnxn_gpu_init(gmx_nbnxn_ocl_t          **p_nb,
  
      if (nb->bDoTime)
      {
-        init_timers(nb->timers, nb->bUseTwoStreams == CL_TRUE);
          init_timings(nb->timings);
      }
  
@@ -742,7 +737,7 @@ static void nbnxn_ocl_clear_f(gmx_nbnxn_ocl_t *nb, int natoms_clear)
      cl_int gmx_used_in_debug cl_error;
  
      cl_atomdata_t           *atomData = nb->atdat;
-    cl_command_queue         ls       = nb->stream[eintLocal];
+    cl_command_queue         ls       = nb->stream[InteractionLocality::Local];
      cl_float                 value    = 0.0f;
  
      cl_error = clEnqueueFillBuffer(ls, atomData->f, &value, sizeof(cl_float),
@@ -753,8 +748,8 @@ static void nbnxn_ocl_clear_f(gmx_nbnxn_ocl_t *nb, int natoms_clear)
  
  //! This function is documented in the header file
  void
-nbnxn_gpu_clear_outputs(gmx_nbnxn_ocl_t   *nb,
-                        int                flags)
+gpu_clear_outputs(gmx_nbnxn_ocl_t   *nb,
+                  const int          flags)
  {
      nbnxn_ocl_clear_f(nb, nb->atdat->natoms);
      /* clear shift force array and energies if the outputs were
@@ -766,14 +761,14 @@ nbnxn_gpu_clear_outputs(gmx_nbnxn_ocl_t   *nb,
  
      /* kick off buffer clearing kernel to ensure concurrency with constraints/update */
      cl_int gmx_unused cl_error;
-    cl_error = clFlush(nb->stream[eintLocal]);
+    cl_error = clFlush(nb->stream[InteractionLocality::Local]);
      assert(CL_SUCCESS == cl_error);
  }
  
  //! This function is documented in the header file
-void nbnxn_gpu_init_pairlist(gmx_nbnxn_ocl_t        *nb,
-                             const NbnxnPairlistGpu *h_plist,
-                             int                     iloc)
+void gpu_init_pairlist(gmx_nbnxn_ocl_t           *nb,
+                       const NbnxnPairlistGpu    *h_plist,
+                       const InteractionLocality  iloc)
  {
      char             sbuf[STRLEN];
      // Timing accumulation should happen only if there was work to do
@@ -797,10 +792,12 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_ocl_t        *nb,
          }
      }
  
+    gpu_timers_t::Interaction &iTimers = nb->timers->interaction[iloc];
+
      if (bDoTime)
      {
-        nb->timers->pl_h2d[iloc].openTimingRegion(stream);
-        nb->timers->didPairlistH2D[iloc] = true;
+        iTimers.pl_h2d.openTimingRegion(stream);
+        iTimers.didPairlistH2D = true;
      }
  
      // TODO most of this function is same in CUDA and OpenCL, move into the header
@@ -810,13 +807,13 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_ocl_t        *nb,
                             &d_plist->nsci, &d_plist->sci_nalloc, context);
      copyToDeviceBuffer(&d_plist->sci, h_plist->sci.data(), 0, h_plist->sci.size(),
                         stream, GpuApiCallBehavior::Async,
-                       bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr);
+                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
  
      reallocateDeviceBuffer(&d_plist->cj4, h_plist->cj4.size(),
                             &d_plist->ncj4, &d_plist->cj4_nalloc, context);
      copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4.data(), 0, h_plist->cj4.size(),
                         stream, GpuApiCallBehavior::Async,
-                       bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr);
+                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
  
      reallocateDeviceBuffer(&d_plist->imask, h_plist->cj4.size()*c_nbnxnGpuClusterpairSplit,
                             &d_plist->nimask, &d_plist->imask_nalloc, context);
@@ -825,11 +822,11 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_ocl_t        *nb,
                             &d_plist->nexcl, &d_plist->excl_nalloc, context);
      copyToDeviceBuffer(&d_plist->excl, h_plist->excl.data(), 0, h_plist->excl.size(),
                         stream, GpuApiCallBehavior::Async,
-                       bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr);
+                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
  
      if (bDoTime)
      {
-        nb->timers->pl_h2d[iloc].closeTimingRegion(stream);
+        iTimers.pl_h2d.closeTimingRegion(stream);
      }
  
      /* need to prune the pair list during the next step */
@@ -837,11 +834,11 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_ocl_t        *nb,
  }
  
  //! This function is documented in the header file
-void nbnxn_gpu_upload_shiftvec(gmx_nbnxn_ocl_t        *nb,
-                               const nbnxn_atomdata_t *nbatom)
+void gpu_upload_shiftvec(gmx_nbnxn_ocl_t        *nb,
+                         const nbnxn_atomdata_t *nbatom)
  {
      cl_atomdata_t   *adat  = nb->atdat;
-    cl_command_queue ls    = nb->stream[eintLocal];
+    cl_command_queue ls    = nb->stream[InteractionLocality::Local];
  
      /* only if we have a dynamic box */
      if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
@@ -853,8 +850,8 @@ void nbnxn_gpu_upload_shiftvec(gmx_nbnxn_ocl_t        *nb,
  }
  
  //! This function is documented in the header file
-void nbnxn_gpu_init_atomdata(gmx_nbnxn_ocl_t               *nb,
-                             const nbnxn_atomdata_t        *nbat)
+void gpu_init_atomdata(gmx_nbnxn_ocl_t        *nb,
+                       const nbnxn_atomdata_t *nbat)
  {
      cl_int           cl_error;
      int              nalloc, natoms;
@@ -862,7 +859,7 @@ void nbnxn_gpu_init_atomdata(gmx_nbnxn_ocl_t               *nb,
      bool             bDoTime = nb->bDoTime == CL_TRUE;
      cl_timers_t     *timers  = nb->timers;
      cl_atomdata_t   *d_atdat = nb->atdat;
-    cl_command_queue ls      = nb->stream[eintLocal];
+    cl_command_queue ls      = nb->stream[InteractionLocality::Local];
  
      natoms    = nbat->numAtoms();
      realloced = false;
@@ -1011,7 +1008,7 @@ static void free_gpu_device_runtime_data(gmx_device_runtime_data_t *runData)
  }
  
  //! This function is documented in the header file
-void nbnxn_gpu_free(gmx_nbnxn_ocl_t *nb)
+void gpu_free(gmx_nbnxn_ocl_t *nb)
  {
      if (nb == nullptr)
      {
@@ -1051,7 +1048,7 @@ void nbnxn_gpu_free(gmx_nbnxn_ocl_t *nb)
      sfree(nb->nbparam);
  
      /* Free plist */
-    auto *plist = nb->plist[eintLocal];
+    auto *plist = nb->plist[InteractionLocality::Local];
      freeDeviceBuffer(&plist->sci);
      freeDeviceBuffer(&plist->cj4);
      freeDeviceBuffer(&plist->imask);
@@ -1059,7 +1056,7 @@ void nbnxn_gpu_free(gmx_nbnxn_ocl_t *nb)
      sfree(plist);
      if (nb->bUseTwoStreams)
      {
-        auto *plist_nl = nb->plist[eintNonlocal];
+        auto *plist_nl = nb->plist[InteractionLocality::NonLocal];
          freeDeviceBuffer(&plist_nl->sci);
          freeDeviceBuffer(&plist_nl->cj4);
          freeDeviceBuffer(&plist_nl->imask);
@@ -1078,12 +1075,12 @@ void nbnxn_gpu_free(gmx_nbnxn_ocl_t *nb)
      nb->nbst.fshift = nullptr;
  
      /* Free command queues */
-    clReleaseCommandQueue(nb->stream[eintLocal]);
-    nb->stream[eintLocal] = nullptr;
+    clReleaseCommandQueue(nb->stream[InteractionLocality::Local]);
+    nb->stream[InteractionLocality::Local] = nullptr;
      if (nb->bUseTwoStreams)
      {
-        clReleaseCommandQueue(nb->stream[eintNonlocal]);
-        nb->stream[eintNonlocal] = nullptr;
+        clReleaseCommandQueue(nb->stream[InteractionLocality::NonLocal]);
+        nb->stream[InteractionLocality::NonLocal] = nullptr;
      }
      /* Free other events */
      if (nb->nonlocal_done)
@@ -1112,13 +1109,13 @@ void nbnxn_gpu_free(gmx_nbnxn_ocl_t *nb)
  }
  
  //! This function is documented in the header file
-gmx_wallclock_gpu_nbnxn_t *nbnxn_gpu_get_timings(gmx_nbnxn_ocl_t *nb)
+gmx_wallclock_gpu_nbnxn_t *gpu_get_timings(gmx_nbnxn_ocl_t *nb)
  {
      return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr;
  }
  
  //! This function is documented in the header file
-void nbnxn_gpu_reset_timings(nonbonded_verlet_t* nbv)
+void gpu_reset_timings(nonbonded_verlet_t* nbv)
  {
      if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
      {
@@ -1127,15 +1124,17 @@ void nbnxn_gpu_reset_timings(nonbonded_verlet_t* nbv)
  }
  
  //! This function is documented in the header file
-int nbnxn_gpu_min_ci_balanced(gmx_nbnxn_ocl_t *nb)
+int gpu_min_ci_balanced(gmx_nbnxn_ocl_t *nb)
  {
      return nb != nullptr ?
             gpu_min_ci_balanced_factor * nb->dev_info->compute_units : 0;
  }
  
  //! This function is documented in the header file
-gmx_bool nbnxn_gpu_is_kernel_ewald_analytical(const gmx_nbnxn_ocl_t *nb)
+gmx_bool gpu_is_kernel_ewald_analytical(const gmx_nbnxn_ocl_t *nb)
  {
      return ((nb->nbparam->eeltype == eelOclEWALD_ANA) ||
              (nb->nbparam->eeltype == eelOclEWALD_ANA_TWIN));
  }
+
+} // namespace Nbnxm
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_internal.h b/src/gromacs/nbnxm/opencl/nbnxm_ocl_internal.h

index 318e72623dfb20928e3345c7c082333c7fbcb722..c735ab5d489e0c0bb8c25da60c0a39c6b16104e8 100644 (file)
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_internal.h
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_internal.h
@@ -46,6 +46,9 @@
  #ifndef NBNXN_OCL_INTERNAL_H
  #define NBNXN_OCL_INTERNAL_H
  
+namespace Nbnxm
+{
+
  /*! \brief Returns true if LJ combination rules are used in the non-bonded kernels.
   *
   *  \param[in] vdwType The VdW interaction/implementation type as defined by evdwOcl in nbnxn_ocl_types.h.
@@ -53,4 +56,6 @@
   */
  bool useLjCombRule(int vdwType);
  
+}      // namespace Nbnxm
+
  #endif /* NBNXN_OCL_INTERNAL_H */
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h

index 75279fb3cde931264759b5e694570c98d69f3e33..57d945bdcb870531bb69add3e861eafe194c8814 100644 (file)
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h
@@ -51,7 +51,9 @@
  #include "gromacs/gpu_utils/oclutils.h"
  #include "gromacs/mdtypes/interaction_const.h"
  #include "gromacs/nbnxm/gpu_types_common.h"
+#include "gromacs/nbnxm/nbnxm.h"
  #include "gromacs/nbnxm/pairlist.h"
+#include "gromacs/utility/enumerationhelpers.h"
  #include "gromacs/utility/fatalerror.h"
  #include "gromacs/utility/real.h"
  
@@ -243,12 +245,12 @@ typedef struct cl_nbparam_params
  /*! \internal
   * \brief Pair list data.
   */
-using cl_plist_t = gpu_plist;
+using cl_plist_t = Nbnxm::gpu_plist;
  
  /** \internal
   * \brief Typedef of actual timer type.
   */
-typedef struct nbnxn_gpu_timers_t cl_timers_t;
+typedef struct Nbnxm::gpu_timers_t cl_timers_t;
  
  /*! \internal
   * \brief Main data structure for OpenCL nonbonded force calculations.
@@ -278,15 +280,15 @@ struct gmx_nbnxn_ocl_t
      cl_kernel           kernel_zero_e_fshift;
      ///@}
  
-    cl_bool             bUseTwoStreams;        /**< true if doing both local/non-local NB work on GPU          */
-    cl_bool             bNonLocalStreamActive; /**< true indicates that the nonlocal_done event was enqueued   */
+    cl_bool                                                             bUseTwoStreams;        /**< true if doing both local/non-local NB work on GPU          */
+    cl_bool                                                             bNonLocalStreamActive; /**< true indicates that the nonlocal_done event was enqueued   */
  
-    cl_atomdata_t      *atdat;                 /**< atom data                                                  */
-    cl_nbparam_t       *nbparam;               /**< parameters required for the non-bonded calc.               */
-    cl_plist_t         *plist[2];              /**< pair-list data structures (local and non-local)            */
-    cl_nb_staging_t     nbst;                  /**< staging area where fshift/energies get downloaded          */
+    cl_atomdata_t                                                      *atdat;                 /**< atom data                                                  */
+    cl_nbparam_t                                                       *nbparam;               /**< parameters required for the non-bonded calc.               */
+    gmx::EnumerationArray<Nbnxm::InteractionLocality, cl_plist_t *>     plist;                 /**< pair-list data structures (local and non-local)            */
+    cl_nb_staging_t                                                     nbst;                  /**< staging area where fshift/energies get downloaded          */
  
-    cl_command_queue    stream[2];             /**< local and non-local GPU queues                             */
+    gmx::EnumerationArray<Nbnxm::InteractionLocality, cl_command_queue> stream;                /**< local and non-local GPU queues                             */
  
      /** events used for synchronization */
      cl_event nonlocal_done;                     /**< event triggered when the non-local non-bonded kernel
diff --git a/src/gromacs/nbnxm/pairlist.cpp b/src/gromacs/nbnxm/pairlist.cpp

index 17dd8e09fdb5753bbe5913fc5b0c297a92b2895b..16f940a71a23fc093298f8d96ae21b03c562465f 100644 (file)
--- a/src/gromacs/nbnxm/pairlist.cpp
+++ b/src/gromacs/nbnxm/pairlist.cpp
@@ -75,6 +75,8 @@
  
  using namespace gmx; // TODO: Remove when this file is moved into gmx namespace
  
+// Convience alias for partial Nbnxn namespace usage
+using InteractionLocality = Nbnxm::InteractionLocality;
  
  /* We shift the i-particles backward for PBC.
   * This leads to more conditionals than shifting forward.
@@ -2650,12 +2652,12 @@ static real nonlocal_vol2(const struct gmx_domdec_zones_t *zones, const rvec ls,
  }
  
  /* Estimates the average size of a full j-list for super/sub setup */
-static void get_nsubpair_target(const nbnxn_search   *nbs,
-                                int                   iloc,
-                                real                  rlist,
-                                int                   min_ci_balanced,
-                                int                  *nsubpair_target,
-                                float                *nsubpair_tot_est)
+static void get_nsubpair_target(const nbnxn_search        *nbs,
+                                const InteractionLocality  iloc,
+                                const real                 rlist,
+                                const int                  min_ci_balanced,
+                                int                       *nsubpair_target,
+                                float                     *nsubpair_tot_est)
  {
      /* The target value of 36 seems to be the optimum for Kepler.
       * Maxwell is less sensitive to the exact value.
@@ -2702,7 +2704,7 @@ static void get_nsubpair_target(const nbnxn_search   *nbs,
              nonlocal_vol2(nbs->zones, ls, r_eff_sup);
      }
  
-    if (LOCAL_I(iloc))
+    if (iloc == InteractionLocality::Local)
      {
          /* Sub-cell interacts with itself */
          vol_est  = ls[XX]*ls[YY]*ls[ZZ];
@@ -4077,15 +4079,15 @@ static void sort_sci(NbnxnPairlistGpu *nbl)
  }
  
  /* Make a local or non-local pair-list, depending on iloc */
-void nbnxn_make_pairlist(nbnxn_search         *nbs,
-                         nbnxn_atomdata_t     *nbat,
-                         const t_blocka       *excl,
-                         real                  rlist,
-                         int                   min_ci_balanced,
-                         nbnxn_pairlist_set_t *nbl_list,
-                         int                   iloc,
-                         int                   nb_kernel_type,
-                         t_nrnb               *nrnb)
+void nbnxn_make_pairlist(nbnxn_search              *nbs,
+                         nbnxn_atomdata_t          *nbat,
+                         const t_blocka            *excl,
+                         const real                 rlist,
+                         const int                  min_ci_balanced,
+                         nbnxn_pairlist_set_t      *nbl_list,
+                         const InteractionLocality  iloc,
+                         const int                  nb_kernel_type,
+                         t_nrnb                    *nrnb)
  {
      int                nsubpair_target;
      float              nsubpair_tot_est;
@@ -4105,13 +4107,13 @@ void nbnxn_make_pairlist(nbnxn_search         *nbs,
  
      nbat->bUseBufferFlags = (nbat->out.size() > 1);
      /* We should re-init the flags before making the first list */
-    if (nbat->bUseBufferFlags && LOCAL_I(iloc))
+    if (nbat->bUseBufferFlags && iloc == InteractionLocality::Local)
      {
          init_buffer_flags(&nbat->buffer_flags, nbat->numAtoms());
      }
  
      int nzi;
-    if (LOCAL_I(iloc))
+    if (iloc == InteractionLocality::Local)
      {
          /* Only zone (grid) 0 vs 0 */
          nzi = 1;
@@ -4156,7 +4158,7 @@ void nbnxn_make_pairlist(nbnxn_search         *nbs,
  
          int                 zj0;
          int                 zj1;
-        if (LOCAL_I(iloc))
+        if (iloc == InteractionLocality::Local)
          {
              zj0 = 0;
              zj1 = 1;
@@ -4186,7 +4188,7 @@ void nbnxn_make_pairlist(nbnxn_search         *nbs,
              /* With GPU: generate progressively smaller lists for
               * load balancing for local only or non-local with 2 zones.
               */
-            progBal = (LOCAL_I(iloc) || nbs->zones->n <= 2);
+            progBal = (iloc == InteractionLocality::Local || nbs->zones->n <= 2);
  
  #pragma omp parallel for num_threads(nnbl) schedule(static)
              for (int th = 0; th < nnbl; th++)
@@ -4342,12 +4344,12 @@ void nbnxn_make_pairlist(nbnxn_search         *nbs,
      }
  
      /* Special performance logging stuff (env.var. GMX_NBNXN_CYCLE) */
-    if (LOCAL_I(iloc))
+    if (iloc == InteractionLocality::Local)
      {
          nbs->search_count++;
      }
      if (nbs->print_cycles &&
-        (!nbs->DomDec || !LOCAL_I(iloc)) &&
+        (!nbs->DomDec || iloc == InteractionLocality::NonLocal) &&
          nbs->search_count % 100 == 0)
      {
          nbs_cycle_print(stderr, nbs);
diff --git a/src/gromacs/nbnxm/pairlistset.h b/src/gromacs/nbnxm/pairlistset.h

index 0e92eb3cde7c16a92a96bdd624a8d2da3a603c4d..885e8f07be42b643b005cb5acab4abe5e1d4d91d 100644 (file)
--- a/src/gromacs/nbnxm/pairlistset.h
+++ b/src/gromacs/nbnxm/pairlistset.h
@@ -40,6 +40,8 @@
  #include "gromacs/utility/basedefinitions.h"
  #include "gromacs/utility/real.h"
  
+#include "locality.h"
+
  struct gmx_domdec_zones_t;
  struct gmx_groups_t;
  struct nbnxn_atomdata_t;
@@ -86,15 +88,15 @@ void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list,
   * for the number of equally sized lists is below min_ci_balanced.
   * With perturbed particles, also a group scheme style nbl_fep list is made.
   */
-void nbnxn_make_pairlist(nbnxn_search         *nbs,
-                         nbnxn_atomdata_t     *nbat,
-                         const t_blocka       *excl,
-                         real                  rlist,
-                         int                   min_ci_balanced,
-                         nbnxn_pairlist_set_t *nbl_list,
-                         int                   iloc,
-                         int                   nb_kernel_type,
-                         t_nrnb               *nrnb);
+void nbnxn_make_pairlist(nbnxn_search               *nbs,
+                         nbnxn_atomdata_t           *nbat,
+                         const t_blocka             *excl,
+                         real                        rlist,
+                         int                         min_ci_balanced,
+                         nbnxn_pairlist_set_t       *nbl_list,
+                         Nbnxm::InteractionLocality  iloc,
+                         int                         nb_kernel_type,
+                         t_nrnb                     *nrnb);
  
  /*! \brief Prepare the list-set produced by the search for dynamic pruning
   *
diff --git a/src/gromacs/nbnxm/prunekerneldispatch.cpp b/src/gromacs/nbnxm/prunekerneldispatch.cpp

index 96a188370124a7c32c673acb58330f45b509d2a2..76938e744df0acb5190a0dc754b6607d840c573d 100644 (file)
--- a/src/gromacs/nbnxm/prunekerneldispatch.cpp
+++ b/src/gromacs/nbnxm/prunekerneldispatch.cpp
@@ -45,9 +45,9 @@
  #include "kernels_simd_4xm/kernel_prune.h"
  
  
-void NbnxnDispatchPruneKernel(nonbonded_verlet_t *nbv,
-                              int                 ilocality,
-                              const rvec         *shift_vec)
+void NbnxnDispatchPruneKernel(nonbonded_verlet_t               *nbv,
+                              const Nbnxm::InteractionLocality  ilocality,
+                              const rvec                       *shift_vec)
  {
      nonbonded_verlet_group_t &nbvg       = nbv->grp[ilocality];
      nbnxn_pairlist_set_t     *nbl_lists  = &nbvg.nbl_lists;
author	Berk Hess <hess@kth.se>
	Tue, 15 Jan 2019 08:35:04 +0000 (09:35 +0100)
committer	Mark Abraham <mark.j.abraham@gmail.com>
	Sun, 17 Feb 2019 18:25:33 +0000 (19:25 +0100)
src/gromacs/domdec/partition.cpp		patch \| blob \| history
src/gromacs/ewald/pme_load_balancing.cpp		patch \| blob \| history
src/gromacs/mdlib/forcerec.cpp		patch \| blob \| history
src/gromacs/mdlib/resethandler.cpp		patch \| blob \| history
src/gromacs/mdlib/sim_util.cpp		patch \| blob \| history
src/gromacs/nbnxm/atomdata.cpp		patch \| blob \| history
src/gromacs/nbnxm/atomdata.h		patch \| blob \| history
src/gromacs/nbnxm/cuda/nbnxm_cuda.cu		patch \| blob \| history
src/gromacs/nbnxm/cuda/nbnxm_cuda.h		patch \| blob \| history
src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu		patch \| blob \| history
src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h		patch \| blob \| history
src/gromacs/nbnxm/gpu_common.h		patch \| blob \| history
src/gromacs/nbnxm/gpu_common_utils.h		patch \| blob \| history
src/gromacs/nbnxm/gpu_data_mgmt.h		patch \| blob \| history
src/gromacs/nbnxm/gpu_types_common.h		patch \| blob \| history
src/gromacs/nbnxm/kerneldispatch.cpp		patch \| blob \| history
src/gromacs/nbnxm/locality.h	[new file with mode: 0644]	patch \| blob
src/gromacs/nbnxm/nbnxm.h		patch \| blob \| history
src/gromacs/nbnxm/nbnxm_gpu.h		patch \| blob \| history
src/gromacs/nbnxm/nbnxm_setup.cpp		patch \| blob \| history
src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp		patch \| blob \| history
src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp		patch \| blob \| history
src/gromacs/nbnxm/opencl/nbnxm_ocl_internal.h		patch \| blob \| history
src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h		patch \| blob \| history
src/gromacs/nbnxm/pairlist.cpp		patch \| blob \| history
src/gromacs/nbnxm/pairlistset.h		patch \| blob \| history
src/gromacs/nbnxm/prunekerneldispatch.cpp		patch \| blob \| history